Skip to content

canvod.grids API Reference

Hemispheric grid implementations and spatial analysis tools.

Package

HEALPix and hemispheric grid operations.

Provides hemisphere grid structures for GNSS signal observation analysis.

BaseGridBuilder

Bases: ABC

Abstract base for hemispherical grid builders.

Parameters

angular_resolution : float — Angular resolution in degrees.
cutoff_theta : float — Maximum polar angle cutoff in degrees.
phi_rotation : float — Rotation angle in degrees (applied to all phi values).

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
class BaseGridBuilder(ABC):
    """Abstract base for hemispherical grid builders.

    Subclasses implement ``_build_grid`` (the tessellation itself) and
    ``get_grid_type`` (its identifier); ``build`` wraps both with logging
    and an optional rigid phi rotation.

    Parameters
    ----------
    angular_resolution : float
        Angular resolution in degrees
    cutoff_theta : float
        Maximum polar angle cutoff in degrees
    phi_rotation : float
        Rotation angle in degrees (applied to all phi values)

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        # Keep both degree and radian forms: degrees for logging/metadata,
        # radians (converted once here) for all trigonometric work downstream.
        self.angular_resolution = angular_resolution
        self.angular_resolution_rad = np.deg2rad(angular_resolution)
        self.cutoff_theta = cutoff_theta
        self.cutoff_theta_rad = np.deg2rad(cutoff_theta)
        self.phi_rotation = phi_rotation
        self.phi_rotation_rad = np.deg2rad(phi_rotation)
        self._logger = _get_logger()

    @abstractmethod
    def _build_grid(
        self,
    ) -> (
        tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]
        | tuple[
            pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray], dict[str, Any]
        ]
    ):
        """Build grid.

        Concrete builders return either a 4-tuple, or a 5-tuple whose last
        element is forwarded to ``GridData`` as extra keyword arguments.

        Returns
        -------
        grid : pl.DataFrame
            Grid cells
        theta_lims : np.ndarray
            Theta band limits
        phi_lims : list[np.ndarray]
            Phi limits per band
        cell_ids : list[np.ndarray]
            Cell IDs per band
        extra_kwargs : dict, optional
            Additional metadata

        """

    @abstractmethod
    def get_grid_type(self) -> str:
        """Get grid type identifier."""

    def build(self) -> GridData:
        """Build hemisphere grid.

        Returns
        -------
        GridData
            Complete grid data structure

        """
        self._logger.info(
            "grid_build_started",
            grid_type=self.get_grid_type(),
            angular_resolution=self.angular_resolution,
        )

        result = self._build_grid()

        # Dispatch on tuple arity: 5-element results carry builder-specific
        # metadata (e.g. vertices, voronoi) forwarded to GridData below.
        if len(result) == 4:
            grid, theta_lims, phi_lims, cell_ids = result
            extra_kwargs = {}
        elif len(result) == 5:
            grid, theta_lims, phi_lims, cell_ids, extra_kwargs = result
        else:
            raise ValueError(f"Invalid grid builder result: {len(result)} elements")

        # Apply phi rotation if specified (vectorized operations)
        if self.phi_rotation_rad != 0:
            # The expression is not aliased, so polars keeps the root column
            # name and overwrites "phi" in place, wrapped into [0, 2*pi).
            grid = grid.with_columns(
                [(pl.col("phi") + self.phi_rotation_rad) % (2 * np.pi)]
            )

            # Cell bounds only exist for rectangular-style grids; rotate them
            # too when present so bounds stay consistent with cell centres.
            if "phi_min" in grid.columns:
                grid = grid.with_columns(
                    [
                        (
                            (pl.col("phi_min") + self.phi_rotation_rad) % (2 * np.pi)
                        ).alias("phi_min"),
                        (
                            (pl.col("phi_max") + self.phi_rotation_rad) % (2 * np.pi)
                        ).alias("phi_max"),
                    ]
                )

        self._logger.info("grid_build_complete", ncells=len(grid))

        return GridData(
            grid=grid,
            theta_lims=theta_lims,
            phi_lims=phi_lims,
            cell_ids=cell_ids,
            grid_type=self.get_grid_type(),
            **extra_kwargs,
        )

__init__(angular_resolution=2, cutoff_theta=0, phi_rotation=0)

Initialize the grid builder.

Parameters

angular_resolution : float, default 2 — Angular resolution in degrees.
cutoff_theta : float, default 0 — Maximum polar angle cutoff in degrees.
phi_rotation : float, default 0 — Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    phi_rotation: float = 0,
) -> None:
    """Initialize the grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    # Keep both degree and radian forms: degrees for logging/metadata,
    # radians (converted once here) for all trigonometric work downstream.
    self.angular_resolution = angular_resolution
    self.angular_resolution_rad = np.deg2rad(angular_resolution)
    self.cutoff_theta = cutoff_theta
    self.cutoff_theta_rad = np.deg2rad(cutoff_theta)
    self.phi_rotation = phi_rotation
    self.phi_rotation_rad = np.deg2rad(phi_rotation)
    self._logger = _get_logger()

get_grid_type() abstractmethod

Get grid type identifier.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
84
85
86
@abstractmethod
def get_grid_type(self) -> str:
    """Get grid type identifier.

    Returns
    -------
    str
        Short identifier string such as ``"equal_area"`` (one of the
        ``GridType`` enum values).

    """

build()

Build hemisphere grid.

Returns

GridData — Complete grid data structure.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def build(self) -> GridData:
    """Build hemisphere grid.

    Calls the subclass ``_build_grid``, applies the optional rigid phi
    rotation, and packages everything into an immutable ``GridData``.

    Returns
    -------
    GridData
        Complete grid data structure

    """
    self._logger.info(
        "grid_build_started",
        grid_type=self.get_grid_type(),
        angular_resolution=self.angular_resolution,
    )

    result = self._build_grid()

    # Dispatch on tuple arity: 5-element results carry builder-specific
    # metadata (e.g. vertices, voronoi) forwarded to GridData below.
    if len(result) == 4:
        grid, theta_lims, phi_lims, cell_ids = result
        extra_kwargs = {}
    elif len(result) == 5:
        grid, theta_lims, phi_lims, cell_ids, extra_kwargs = result
    else:
        raise ValueError(f"Invalid grid builder result: {len(result)} elements")

    # Apply phi rotation if specified (vectorized operations)
    if self.phi_rotation_rad != 0:
        # Un-aliased expression keeps the root column name, so this
        # overwrites "phi" in place, wrapped into [0, 2*pi).
        grid = grid.with_columns(
            [(pl.col("phi") + self.phi_rotation_rad) % (2 * np.pi)]
        )

        # Cell bounds only exist for rectangular-style grids; rotate them
        # too when present so bounds stay consistent with cell centres.
        if "phi_min" in grid.columns:
            grid = grid.with_columns(
                [
                    (
                        (pl.col("phi_min") + self.phi_rotation_rad) % (2 * np.pi)
                    ).alias("phi_min"),
                    (
                        (pl.col("phi_max") + self.phi_rotation_rad) % (2 * np.pi)
                    ).alias("phi_max"),
                ]
            )

    self._logger.info("grid_build_complete", ncells=len(grid))

    return GridData(
        grid=grid,
        theta_lims=theta_lims,
        phi_lims=phi_lims,
        cell_ids=cell_ids,
        grid_type=self.get_grid_type(),
        **extra_kwargs,
    )

GridData dataclass

Immutable container for hemispherical grid structure.

Parameters

grid : pl.DataFrame — Grid cells with phi, theta, and bounds.
theta_lims : np.ndarray — Theta band limits.
phi_lims : list[np.ndarray] — Phi limits per theta band.
cell_ids : list[np.ndarray] — Cell IDs per theta band.
grid_type : str — Grid type identifier.
solid_angles : np.ndarray, optional — Solid angles per cell [steradians].
metadata : dict, optional — Additional grid metadata.
voronoi : Any, optional — Voronoi tessellation object (for Fibonacci grids).
vertices : np.ndarray, optional — 3D vertices (for triangular grids).
points_xyz : np.ndarray, optional — 3D point cloud (for Fibonacci grids).
vertex_phi : np.ndarray, optional — Vertex phi coordinates.
vertex_theta : np.ndarray, optional — Vertex theta coordinates.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
@dataclass(frozen=True)
class GridData:
    """Immutable container for hemispherical grid structure.

    Parameters
    ----------
    grid : pl.DataFrame
        Grid cells with phi, theta, and bounds
    theta_lims : np.ndarray
        Theta band limits
    phi_lims : list[np.ndarray]
        Phi limits per theta band
    cell_ids : list[np.ndarray]
        Cell IDs per theta band
    grid_type : str
        Grid type identifier
    solid_angles : np.ndarray, optional
        Solid angles per cell [steradians]
    metadata : dict, optional
        Additional grid metadata
    voronoi : Any, optional
        Voronoi tessellation object (for Fibonacci grids)
    vertices : np.ndarray, optional
        3D vertices (for triangular grids)
    points_xyz : np.ndarray, optional
        3D point cloud (for Fibonacci grids)
    vertex_phi : np.ndarray, optional
        Vertex phi coordinates
    vertex_theta : np.ndarray, optional
        Vertex theta coordinates

    """

    # Required tessellation data (always present).
    grid: pl.DataFrame
    theta_lims: np.ndarray
    phi_lims: list[np.ndarray]
    cell_ids: list[np.ndarray]
    grid_type: str
    # Optional, grid-type-specific extras (e.g. forwarded by
    # BaseGridBuilder.build via a builder's extra_kwargs).
    solid_angles: np.ndarray | None = None
    metadata: dict | None = None
    voronoi: Any | None = None
    vertices: np.ndarray | None = None
    points_xyz: np.ndarray | None = None
    vertex_phi: np.ndarray | None = None
    vertex_theta: np.ndarray | None = None

    @property
    def coords(self) -> pl.DataFrame:
        """Get cell coordinates."""
        return self.grid.select(["phi", "theta"])

    @property
    def ncells(self) -> int:
        """Number of cells in grid."""
        return len(self.grid)

    def get_patches(self) -> pl.Series:
        """Create matplotlib patches for polar visualization."""
        # Each cell becomes an axis-aligned rectangle in (phi, theta)
        # parameter space, anchored at (phi_min, theta_min).
        patches = [
            Rectangle(
                (row["phi_min"], row["theta_min"]),
                row["phi_max"] - row["phi_min"],
                row["theta_max"] - row["theta_min"],
                fill=True,
            )
            for row in self.grid.iter_rows(named=True)
        ]
        return pl.Series("Patches", patches)

    def get_solid_angles(self) -> np.ndarray:
        """Calculate solid angle for each cell [steradians].

        Precomputed values are returned as-is; otherwise the computation
        is dispatched on grid_type, falling back to the generic
        rectangular-cell formula at the end.
        """
        if self.solid_angles is not None:
            return self.solid_angles

        # HEALPix: all pixels share one exact area, nside2pixarea(nside).
        if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
            try:
                import healpy as hp

                nside = int(self.grid["healpix_nside"][0])
                return np.full(
                    len(self.grid), hp.nside2pixarea(nside), dtype=np.float64
                )
            except ImportError:
                # healpy missing: fall through to the geometric fallback below.
                pass

        # Geodesic
        if self.grid_type == "geodesic" and "geodesic_vertices" in self.grid.columns:
            return self._compute_geodesic_solid_angles()

        # HTM
        if self.grid_type == "htm" and "htm_vertex_0" in self.grid.columns:
            return self._compute_htm_solid_angles()

        # Fibonacci
        if self.grid_type == "fibonacci" and "voronoi_region" in self.grid.columns:
            return self._compute_voronoi_solid_angles()

        # Default
        return self._geometric_solid_angles()

    def _compute_htm_solid_angles(self) -> np.ndarray:
        """Compute solid angles for HTM triangular cells.

        Uses the spherical excess of each triangle via L'Huilier's theorem;
        vertices are assumed unit vectors (dot products clipped for acos).
        """
        solid_angles = []

        for row in self.grid.iter_rows(named=True):
            v0 = np.array(row["htm_vertex_0"])
            v1 = np.array(row["htm_vertex_1"])
            v2 = np.array(row["htm_vertex_2"])

            # Spherical excess formula
            # Side arc lengths (angles between unit vertex vectors).
            a = np.arccos(np.clip(np.dot(v1, v2), -1, 1))
            b = np.arccos(np.clip(np.dot(v0, v2), -1, 1))
            c = np.arccos(np.clip(np.dot(v0, v1), -1, 1))

            # L'Huilier: tan(E/4) = sqrt(tan(s/2) tan((s-a)/2) tan((s-b)/2) tan((s-c)/2))
            s = (a + b + c) / 2
            tan_E_4 = np.sqrt(
                np.tan(s / 2)
                * np.tan((s - a) / 2)
                * np.tan((s - b) / 2)
                * np.tan((s - c) / 2)
            )
            E = 4 * np.arctan(tan_E_4)

            solid_angles.append(E)

        return np.array(solid_angles)

    def _compute_geodesic_solid_angles(self) -> np.ndarray:
        """Compute solid angles for geodesic triangular cells.

        Same L'Huilier spherical-excess computation as the HTM case, but
        triangle corners are looked up by index in ``self.vertices``.
        """
        vertices = self.vertices
        if vertices is None:
            # No vertex table: degrade to the rectangular-bounds formula.
            return self._geometric_solid_angles()

        solid_angles = []
        for row in self.grid.iter_rows(named=True):
            v_indices = row["geodesic_vertices"]
            v0, v1, v2 = vertices[v_indices]

            a = np.arccos(np.clip(np.dot(v1, v2), -1, 1))
            b = np.arccos(np.clip(np.dot(v0, v2), -1, 1))
            c = np.arccos(np.clip(np.dot(v0, v1), -1, 1))

            s = (a + b + c) / 2
            tan_E_4 = np.sqrt(
                np.tan(s / 2)
                * np.tan((s - a) / 2)
                * np.tan((s - b) / 2)
                * np.tan((s - c) / 2)
            )
            E = 4 * np.arctan(tan_E_4)

            solid_angles.append(E)

        return np.array(solid_angles)

    def _compute_voronoi_solid_angles(self) -> np.ndarray:
        """Compute solid angles for Voronoi cells.

        Each polygonal cell is fan-triangulated around its centre point and
        the triangle excesses (L'Huilier) are summed.  Degenerate regions
        (< 3 vertices) yield NaN.
        """
        if self.voronoi is None:
            return self._geometric_solid_angles()

        sv = self.voronoi
        solid_angles = []
        for row in self.grid.iter_rows(named=True):
            region = row["voronoi_region"]
            if len(region) < 3:
                solid_angles.append(np.nan)
                continue

            vertices = sv.vertices[region]
            # Cell centre as a unit vector from its (phi, theta) coordinates.
            center = np.array(
                [
                    np.sin(row["theta"]) * np.cos(row["phi"]),
                    np.sin(row["theta"]) * np.sin(row["phi"]),
                    np.cos(row["theta"]),
                ]
            )

            # Fan triangulation: (center, v_i, v_{i+1}) with wrap-around.
            total_angle = 0
            n = len(vertices)
            for i in range(n):
                v1 = vertices[i]
                v2 = vertices[(i + 1) % n]
                a = np.arccos(np.clip(np.dot(center, v1), -1, 1))
                b = np.arccos(np.clip(np.dot(center, v2), -1, 1))
                c = np.arccos(np.clip(np.dot(v1, v2), -1, 1))
                s = (a + b + c) / 2
                tan_E_4 = np.sqrt(
                    np.tan(s / 2)
                    * np.tan((s - a) / 2)
                    * np.tan((s - b) / 2)
                    * np.tan((s - c) / 2)
                )
                E = 4 * np.arctan(tan_E_4)
                total_angle += E
            solid_angles.append(total_angle)

        return np.array(solid_angles)

    def _geometric_solid_angles(self) -> np.ndarray:
        """Fallback geometric calculation.

        Exact for cells rectangular in (phi, theta):
        omega = (phi_max - phi_min) * (cos(theta_min) - cos(theta_max)).
        """
        solid_angles = []
        for row in self.grid.iter_rows(named=True):
            delta_phi = row["phi_max"] - row["phi_min"]
            cos_diff = np.cos(row["theta_min"]) - np.cos(row["theta_max"])
            omega = delta_phi * cos_diff
            solid_angles.append(omega)
        return np.array(solid_angles)

    def get_grid_stats(self) -> dict:
        """Get grid statistics including solid angle uniformity."""
        solid_angles = self.get_solid_angles()

        # NOTE(review): the CV below divides by the mean solid angle; for a
        # degenerate grid (zero/NaN areas) this yields inf/NaN — confirm
        # callers tolerate that.
        stats = {
            "total_cells": self.ncells,
            "grid_type": self.grid_type,
            "theta_bands": len(self.theta_lims),
            "cells_per_band": [len(ids) for ids in self.cell_ids],
            "solid_angle_mean_sr": float(np.mean(solid_angles)),
            "solid_angle_std_sr": float(np.std(solid_angles)),
            "solid_angle_cv_percent": float(
                np.std(solid_angles) / np.mean(solid_angles) * 100
            ),
            # Total covered area vs the full hemisphere (2*pi sr) for
            # quick coverage sanity checks.
            "total_solid_angle_sr": float(np.sum(solid_angles)),
            "hemisphere_solid_angle_sr": 2 * np.pi,
        }

        # Add HEALPix-specific info
        if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
            try:
                import healpy as hp

                nside = int(self.grid["healpix_nside"][0])
                stats["healpix_nside"] = nside
                stats["healpix_npix_total"] = hp.nside2npix(nside)
                stats["healpix_pixel_area_sr"] = hp.nside2pixarea(nside)
                stats["healpix_resolution_arcmin"] = hp.nside2resol(nside, arcmin=True)
            except ImportError:
                # healpy missing: return the generic stats only.
                pass

        return stats

coords property

Get cell coordinates.

ncells property

Number of cells in grid.

get_patches()

Create matplotlib patches for polar visualization.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
67
68
69
70
71
72
73
74
75
76
77
78
def get_patches(self) -> pl.Series:
    """Create matplotlib patches for polar visualization.

    Returns
    -------
    pl.Series
        Series named "Patches" with one ``Rectangle`` per grid cell,
        drawn in (phi, theta) parameter space.

    """
    # Each cell becomes an axis-aligned rectangle anchored at
    # (phi_min, theta_min) with the cell's angular width and height.
    patches = [
        Rectangle(
            (row["phi_min"], row["theta_min"]),
            row["phi_max"] - row["phi_min"],
            row["theta_max"] - row["theta_min"],
            fill=True,
        )
        for row in self.grid.iter_rows(named=True)
    ]
    return pl.Series("Patches", patches)

get_solid_angles()

Calculate solid angle for each cell [steradians].

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def get_solid_angles(self) -> np.ndarray:
    """Calculate solid angle for each cell [steradians].

    Precomputed values are returned as-is; otherwise the computation is
    dispatched on grid_type, falling back to the generic
    rectangular-cell formula at the end.
    """
    if self.solid_angles is not None:
        return self.solid_angles

    # HEALPix: all pixels share one exact area, nside2pixarea(nside).
    if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
        try:
            import healpy as hp

            nside = int(self.grid["healpix_nside"][0])
            return np.full(
                len(self.grid), hp.nside2pixarea(nside), dtype=np.float64
            )
        except ImportError:
            # healpy missing: fall through to the geometric fallback below.
            pass

    # Geodesic
    if self.grid_type == "geodesic" and "geodesic_vertices" in self.grid.columns:
        return self._compute_geodesic_solid_angles()

    # HTM
    if self.grid_type == "htm" and "htm_vertex_0" in self.grid.columns:
        return self._compute_htm_solid_angles()

    # Fibonacci
    if self.grid_type == "fibonacci" and "voronoi_region" in self.grid.columns:
        return self._compute_voronoi_solid_angles()

    # Default
    return self._geometric_solid_angles()

get_grid_stats()

Get grid statistics including solid angle uniformity.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
def get_grid_stats(self) -> dict:
    """Get grid statistics including solid angle uniformity.

    Returns
    -------
    dict
        Cell counts, per-band sizes, solid-angle mean/std/CV, total
        coverage, and (when applicable) HEALPix-specific figures.

    """
    solid_angles = self.get_solid_angles()

    # NOTE(review): the CV below divides by the mean solid angle; for a
    # degenerate grid (zero/NaN areas) this yields inf/NaN — confirm
    # callers tolerate that.
    stats = {
        "total_cells": self.ncells,
        "grid_type": self.grid_type,
        "theta_bands": len(self.theta_lims),
        "cells_per_band": [len(ids) for ids in self.cell_ids],
        "solid_angle_mean_sr": float(np.mean(solid_angles)),
        "solid_angle_std_sr": float(np.std(solid_angles)),
        "solid_angle_cv_percent": float(
            np.std(solid_angles) / np.mean(solid_angles) * 100
        ),
        # Total covered area vs the full hemisphere (2*pi sr).
        "total_solid_angle_sr": float(np.sum(solid_angles)),
        "hemisphere_solid_angle_sr": 2 * np.pi,
    }

    # Add HEALPix-specific info
    if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
        try:
            import healpy as hp

            nside = int(self.grid["healpix_nside"][0])
            stats["healpix_nside"] = nside
            stats["healpix_npix_total"] = hp.nside2npix(nside)
            stats["healpix_pixel_area_sr"] = hp.nside2pixarea(nside)
            stats["healpix_resolution_arcmin"] = hp.nside2resol(nside, arcmin=True)
        except ImportError:
            # healpy missing: return the generic stats only.
            pass

    return stats

GridType

Bases: Enum

Available grid projection types for hemispherical tessellation.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_types.py
 6
 7
 8
 9
10
11
12
13
14
15
class GridType(Enum):
    """Enumerates the hemispherical tessellation schemes supported here.

    Each member's value is the short string identifier returned by the
    corresponding builder's ``get_grid_type``.
    """

    EQUAL_AREA = "equal_area"  # ring-based cells of (approximately) equal solid angle
    EQUAL_ANGLE = "equal_angle"  # uniform spacing in both theta and phi
    EQUIRECTANGULAR = "equirectangular"  # plain rectangular parameter-space grid
    HEALPIX = "healpix"  # hierarchical equal-area pixelisation
    GEODESIC = "geodesic"  # triangles from a subdivided icosahedron
    FIBONACCI = "fibonacci"  # golden-spiral points with Voronoi cells
    HTM = "htm"  # Hierarchical Triangular Mesh

EqualAreaBuilder

Bases: BaseGridBuilder

Equal solid angle tessellation using concentric theta bands.

The hemisphere is divided into annular bands of constant width in theta. Within each band the number of azimuthal (phi) sectors is chosen so that every cell subtends approximately the same solid angle. This is the only grid type that has been validated for scientific use in this codebase.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle measured from zenith (0 = straight up, π/2 = horizon)

What angular_resolution means

angular_resolution (degrees) sets the width of each theta band. All bands have this same width Δθ. The azimuthal width of cells varies by band: near the zenith cells are wide in phi; near the horizon they are narrow, so that the solid angle stays constant.

Mathematical construction

  1. Target solid angle per cell is chosen equal to the solid angle of a cap of half-angle Δθ/2::

    Ω_target = 2π (1 − cos(Δθ/2))

  2. Zenith cap – a single cell covers [0, Δθ/2] in theta and the full azimuth [0, 2π).

  3. Theta bands – edges are placed at Δθ/2, 3Δθ/2, 5Δθ/2, … up to π/2 − cutoff_theta. For each band [θ_inner, θ_outer] the band's total solid angle is::

    Ω_band = 2π (cos θ_inner − cos θ_outer)

  4. Phi divisions – the number of sectors in the band is::

    n_phi = round(Ω_band / Ω_target)

Each sector spans Δφ = 2π / n_phi. The cell centre is placed at the geometric midpoint of its (phi, theta) rectangle.

Parameters

angular_resolution : float — Theta-band width in degrees. Controls both the radial resolution and (indirectly, via the equal-area constraint) the azimuthal resolution.
cutoff_theta : float — Minimum elevation above the horizon in degrees. Bands whose outer edge is at or below this cutoff are omitted. In GNSS terms this is the satellite elevation mask angle.
phi_rotation : float — Rigid rotation applied to all phi coordinates after grid construction, in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_area_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
class EqualAreaBuilder(BaseGridBuilder):
    """Equal solid angle tessellation using concentric theta bands.

    The hemisphere is divided into annular bands of constant width in theta.
    Within each band the number of azimuthal (phi) sectors is chosen so that
    every cell subtends approximately the same solid angle.  This is the only
    grid type that has been validated for scientific use in this codebase.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle measured from zenith (0 = straight up,
      π/2 = horizon)

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` (degrees) sets the **width of each theta band**.
    All bands have this same width Δθ.  The *azimuthal* width of cells varies
    by band: near the zenith cells are wide in phi; near the horizon they are
    narrow, so that the solid angle stays constant.

    Mathematical construction
    -------------------------
    1. **Target solid angle** per cell is chosen equal to the solid angle of a
       cap of half-angle Δθ/2::

           Ω_target = 2π (1 − cos(Δθ/2))

    2. **Zenith cap** – a single cell covers [0, Δθ/2] in theta and the full
       azimuth [0, 2π).

    3. **Theta bands** – edges are placed at Δθ/2, 3Δθ/2, 5Δθ/2, … up to
       π/2 − cutoff_theta.  For each band [θ_inner, θ_outer] the band's
       total solid angle is::

           Ω_band = 2π (cos θ_inner − cos θ_outer)

    4. **Phi divisions** – the number of sectors in the band is::

           n_phi = round(Ω_band / Ω_target)

       Each sector spans Δφ = 2π / n_phi.  The cell centre is placed at the
       geometric midpoint of its (phi, theta) rectangle.

    Parameters
    ----------
    angular_resolution : float
        Theta-band width in degrees.  Controls both the radial resolution and
        (indirectly, via the equal-area constraint) the azimuthal resolution.
    cutoff_theta : float
        Minimum elevation above the horizon in degrees.  Bands whose outer
        edge is at or below this cutoff are omitted.  In GNSS terms this is
        the satellite elevation mask angle.
    phi_rotation : float
        Rigid rotation applied to all phi coordinates after grid construction,
        in degrees.

    """

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"equal_area"``

        """
        return GridType.EQUAL_AREA.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Construct the equal-area hemisphere grid.

        Returns
        -------
        grid : pl.DataFrame
            One row per cell with columns: phi, theta, phi_min, phi_max,
            theta_min, theta_max, cell_id.
        theta_lims : np.ndarray
            Outer theta edge of each band (radians).
        phi_lims : list[np.ndarray]
            Array of phi_min values for each band.
        cell_ids : list[np.ndarray]
            Cell-id arrays, one per band.

        """
        # Theta band edges (from zenith to horizon)
        # Edges sit at dtheta/2, 3*dtheta/2, ...; np.arange excludes the stop
        # value, so the outermost band can end short of pi/2 - cutoff.
        #
        # NOTE(review): cutoff_theta is used in two different senses below.
        # Here it shrinks the horizon-side extent (elevation-mask reading,
        # matching this class's docstring), but the zenith-cell check and the
        # band-skip check further down compare it against theta directly
        # (zenith-side polar-cutoff reading, matching the base-class
        # docstring "Maximum polar angle cutoff").  Confirm intended
        # semantics against callers before relying on nonzero cutoffs.
        max_theta = np.pi / 2  # horizon
        theta_edges = np.arange(
            self.angular_resolution_rad / 2,
            max_theta - self.cutoff_theta_rad,
            self.angular_resolution_rad,
        )

        # Target solid angle per cell
        target_omega = 2 * np.pi * (1 - np.cos(self.angular_resolution_rad / 2))

        cells = []
        theta_lims = []
        phi_lims = []
        cell_ids = []

        # Zenith cell (special case) - only if cutoff allows
        next_cell_id = 0
        zenith_theta_max = self.angular_resolution_rad / 2

        if self.cutoff_theta_rad < zenith_theta_max:
            # Single cap cell spanning the full azimuth around the zenith.
            cells.append(
                pl.DataFrame(
                    {
                        "phi": [0.0],
                        "theta": [0.0],
                        "phi_min": [0.0],
                        "phi_max": [2 * np.pi],
                        "theta_min": [max(0.0, self.cutoff_theta_rad)],
                        "theta_max": [zenith_theta_max],
                    }
                )
            )
            theta_lims.append(zenith_theta_max)
            phi_lims.append(np.array([0.0]))
            cell_ids.append(np.array([0]))
            next_cell_id = 1

        # Build theta bands
        # Pair consecutive edges: band i spans [theta_edges[i], theta_edges[i+1]].
        for iband, theta_outer in enumerate(theta_edges[1:]):
            theta_inner = theta_edges[iband]

            # Skip bands below cutoff
            if theta_outer <= self.cutoff_theta_rad:
                continue

            # Solid angle of this band
            band_omega = 2 * np.pi * (np.cos(theta_inner) - np.cos(theta_outer))

            # Number of phi divisions
            n_phi = max(1, round(band_omega / target_omega))
            phi_span = 2 * np.pi / n_phi

            # Sequential global ids continuing from the previous band.
            cell_id_list = list(range(next_cell_id, next_cell_id + n_phi))
            next_cell_id = cell_id_list[-1] + 1

            # Use arange for better precision than linspace
            phi_min_arr = np.arange(n_phi) * phi_span
            phi_max_arr = (np.arange(n_phi) + 1) * phi_span
            phi_max_arr[-1] = 2 * np.pi  # Force exact closure

            cells.append(
                pl.DataFrame(
                    {
                        # Cell centre at the midpoint of its (phi, theta) rectangle.
                        "phi": (phi_min_arr + phi_max_arr) / 2,
                        "theta": np.full(n_phi, (theta_inner + theta_outer) / 2),
                        "phi_min": phi_min_arr,
                        "phi_max": phi_max_arr,
                        "theta_min": np.full(n_phi, theta_inner),
                        "theta_max": np.full(n_phi, theta_outer),
                    }
                )
            )

            theta_lims.append(theta_outer)
            phi_lims.append(phi_min_arr)
            cell_ids.append(np.array(cell_id_list))

        if len(cells) == 0:
            raise ValueError(
                "No cells generated - check cutoff_theta and angular_resolution"
            )

        # cell_id column is a fresh 0..n-1 range; it matches the per-band
        # cell_ids lists because bands were appended in the same order.
        grid = pl.concat(cells).with_columns(pl.int_range(0, pl.len()).alias("cell_id"))

        return grid, np.array(theta_lims), phi_lims, cell_ids

get_grid_type()

Return the grid-type identifier string.

Returns

str — ``"equal_area"``

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_area_grid.py
68
69
70
71
72
73
74
75
76
77
def get_grid_type(self) -> str:
    """Return the grid-type identifier string.

    Returns
    -------
    str
        ``"equal_area"``

    """
    # Resolve the enum member first, then hand back its string value.
    grid_type = GridType.EQUAL_AREA
    return grid_type.value

EqualAngleBuilder

Bases: BaseGridBuilder

Equal angular spacing in both theta and phi (NOT equal area).

Every cell is a rectangle of the same angular size Δθ × Δφ in the (theta, phi) parameter space. Because solid angle depends on cos(theta), cells near the zenith subtend more solid angle than cells near the horizon. This makes the grid biased toward the zenith for any solid-angle-weighted statistic. Not recommended for scientific analysis – use EqualAreaBuilder instead.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith

What angular_resolution means

angular_resolution (degrees) is used as both the theta-band width and the phi-sector width. The number of phi divisions is constant across all bands::

n_phi = round(2π / Δθ)

and does not change with latitude.

Mathematical construction

  1. A zenith cap cell covers [0, Δθ/2] × [0, 2π).
  2. Theta band edges are placed at Δθ/2, 3Δθ/2, … up to π/2.
  3. Within every band, the full azimuth is split into n_phi sectors of equal width Δφ = 2π / n_phi.
  4. Cell centres are at the midpoint of each (phi, theta) rectangle.

Parameters

angular_resolution : float Angular spacing in degrees, applied identically in both theta and phi. cutoff_theta : float Elevation mask angle in degrees (bands below this are omitted). phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_angle_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class EqualAngleBuilder(BaseGridBuilder):
    """Equal angular spacing in both theta and phi (NOT equal area).

    Every cell is a rectangle of the same angular size Δθ × Δφ in the
    (theta, phi) parameter space.  Because solid angle depends on cos(theta),
    cells near the zenith subtend *more* solid angle than cells near the
    horizon.  This makes the grid biased toward the zenith for any
    solid-angle-weighted statistic.  **Not recommended for scientific
    analysis** – use ``EqualAreaBuilder`` instead.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` (degrees) is used as **both** the theta-band width
    and the phi-sector width.  The number of phi divisions is constant across
    all bands::

        n_phi = round(2π / Δθ)

    and does not change with latitude.

    Mathematical construction
    -------------------------
    1. A zenith cap cell covers [0, Δθ/2] × [0, 2π).
    2. Theta band edges are placed at Δθ/2, 3Δθ/2, … up to π/2.
    3. Within every band, the full azimuth is split into ``n_phi`` sectors of
       equal width Δφ = 2π / n_phi.
    4. Cell centres are at the midpoint of each (phi, theta) rectangle.

    Parameters
    ----------
    angular_resolution : float
        Angular spacing in degrees, applied identically in both theta and phi.
    cutoff_theta : float
        Elevation mask angle in degrees (bands below this are omitted).
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    """

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"equal_angle"``

        """
        return GridType.EQUAL_ANGLE.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Construct the equal-angle hemisphere grid.

        Returns
        -------
        grid : pl.DataFrame
            One row per cell.
        theta_lims : np.ndarray
            Outer theta edge of each band (radians).
        phi_lims : list[np.ndarray]
            Array of phi_min values for each band (identical across bands).
        cell_ids : list[np.ndarray]
            Cell-id arrays, one per band.

        """
        max_theta = np.pi / 2
        theta_edges = np.arange(
            self.angular_resolution_rad / 2,
            max_theta - self.cutoff_theta_rad,
            self.angular_resolution_rad,
        )

        # Round (don't truncate) so floating-point error just below an exact
        # divisor (e.g. 2π/Δθ evaluating to 179.999…) cannot silently drop a
        # sector.  This matches the documented ``n_phi = round(2π / Δθ)``.
        n_phi_divisions = int(round(2 * np.pi / self.angular_resolution_rad))

        cells = []
        theta_lims = []
        phi_lims = []
        cell_ids = []

        # Zenith cap: a single cell spanning the full azimuth range.
        cells.append(
            pl.DataFrame(
                {
                    "phi": [0.0],
                    "theta": [0.0],
                    "phi_min": [0.0],
                    "phi_max": [2 * np.pi],
                    "theta_min": [0.0],
                    "theta_max": [self.angular_resolution_rad / 2],
                }
            )
        )
        theta_lims.append(self.angular_resolution_rad / 2)
        phi_lims.append(np.array([0.0]))
        cell_ids.append(np.array([0]))
        next_cell_id = 1

        for iband, theta_outer in enumerate(theta_edges[1:]):
            theta_inner = theta_edges[iband]
            phi_span = 2 * np.pi / n_phi_divisions

            cell_id_list = list(range(next_cell_id, next_cell_id + n_phi_divisions))
            next_cell_id = cell_id_list[-1] + 1

            phi_min_arr = np.linspace(0, 2 * np.pi - phi_span, n_phi_divisions)
            # Last sector closes exactly at 2π.
            phi_max_arr = np.concatenate((phi_min_arr[1:], [2 * np.pi]))

            cells.append(
                pl.DataFrame(
                    {
                        "phi": (phi_min_arr + phi_max_arr) / 2,
                        "theta": np.full(
                            n_phi_divisions,
                            (theta_inner + theta_outer) / 2,
                        ),
                        "phi_min": phi_min_arr,
                        "phi_max": phi_max_arr,
                        "theta_min": np.full(n_phi_divisions, theta_inner),
                        "theta_max": np.full(n_phi_divisions, theta_outer),
                    }
                )
            )

            theta_lims.append(theta_outer)
            phi_lims.append(phi_min_arr)
            cell_ids.append(np.array(cell_id_list))

        # Use pl.int_range (Int64) rather than with_row_index (UInt32) so the
        # cell_id dtype is consistent with the other grid builders.
        grid = pl.concat(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )
        return grid, np.array(theta_lims), phi_lims, cell_ids

get_grid_type()

Return the grid-type identifier string.

Returns

str "equal_angle"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_angle_grid.py
53
54
55
56
57
58
59
60
61
62
def get_grid_type(self) -> str:
    """Return the grid-type identifier string.

    Returns
    -------
    str
        ``"equal_angle"``

    """
    # Resolve the enum member first, then hand back its string value.
    grid_type = GridType.EQUAL_ANGLE
    return grid_type.value

EquirectangularBuilder

Bases: BaseGridBuilder

Simple rectangular grid in (theta, phi) space.

The hemisphere is divided into a regular rectangular array: a constant number of theta bands, each containing the same constant number of phi sectors. Every cell is an identical rectangle in angular coordinates. This is structurally identical to EqualAngleBuilder except for one difference in the zenith treatment: EqualAngleBuilder collapses the first band into a single zenith cap, while this builder does not — every band has the same number of sectors.

Because solid angle depends on cos(theta), cells near the zenith subtend more solid angle than cells near the horizon. This makes the grid biased toward the zenith for any solid-angle-weighted statistic. Not recommended for scientific analysis – use EqualAreaBuilder instead.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

What angular_resolution means

angular_resolution (degrees) is used as both the theta-band width and the phi-sector width. The grid is therefore square in angular coordinates::

n_theta = round((π/2 − cutoff) / Δθ)
n_phi   = round(2π / Δθ)
total cells = n_theta × n_phi

Mathematical construction

  1. Theta edges are placed at cutoff_theta, cutoff_theta + Δθ, cutoff_theta + 2Δθ, … up to π/2.
  2. Phi edges are placed at 0, Δθ, 2Δθ, … up to 2π.
  3. Every (theta_band, phi_sector) combination produces one cell. The cell centre is the midpoint of the rectangle.
  4. No special zenith cap is created; the band nearest the zenith has the same number of phi sectors as all other bands.

Parameters

angular_resolution : float Angular spacing in degrees, applied identically in both theta and phi. cutoff_theta : float Elevation mask angle in degrees. Bands whose inner edge is at or below π/2 − cutoff_theta are omitted. phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equirectangular_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class EquirectangularBuilder(BaseGridBuilder):
    """Simple rectangular grid in (theta, phi) space.

    The hemisphere is divided into a regular rectangular array: a constant
    number of theta bands, each containing the same constant number of phi
    sectors.  Every cell is an identical rectangle in angular coordinates.
    This is *structurally* identical to ``EqualAngleBuilder`` except for one
    difference in the zenith treatment: ``EqualAngleBuilder`` collapses the
    first band into a single zenith cap, while this builder does not — every
    band has the same number of sectors.

    Because solid angle depends on cos(theta), cells near the zenith subtend
    *more* solid angle than cells near the horizon.  This makes the grid
    biased toward the zenith for any solid-angle-weighted statistic.
    **Not recommended for scientific analysis** – use ``EqualAreaBuilder``
    instead.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` (degrees) is used as **both** the theta-band width
    *and* the phi-sector width.  The grid is therefore square in angular
    coordinates::

        n_theta = round((π/2 − cutoff) / Δθ)
        n_phi   = round(2π / Δθ)
        total cells = n_theta × n_phi

    Mathematical construction
    -------------------------
    1. Theta edges are placed at ``cutoff_theta``, ``cutoff_theta + Δθ``,
       ``cutoff_theta + 2Δθ``, … up to π/2.
    2. Phi edges are placed at 0, Δθ, 2Δθ, … up to 2π.
    3. Every (theta_band, phi_sector) combination produces one cell.  The
       cell centre is the midpoint of the rectangle.
    4. No special zenith cap is created; the band nearest the zenith has
       the same number of phi sectors as all other bands.

    Parameters
    ----------
    angular_resolution : float
        Angular spacing in degrees, applied identically in both theta and phi.
    cutoff_theta : float
        Elevation mask angle in degrees.  Bands whose *inner* edge is at or
        below ``π/2 − cutoff_theta`` are omitted.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    """

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"equirectangular"``

        """
        return GridType.EQUIRECTANGULAR.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Construct the equirectangular hemisphere grid.

        Returns
        -------
        grid : pl.DataFrame
            One row per cell with columns: phi, theta, phi_min, phi_max,
            theta_min, theta_max, cell_id.
        theta_lims : np.ndarray
            Inner theta edge of each band (radians).
        phi_lims : list[np.ndarray]
            Array of phi_min values for each band (identical across bands).
        cell_ids : list[np.ndarray]
            Cell-id arrays, one per band.

        """
        max_theta = np.pi / 2

        # np.arange's exclusive stop is padded by one step so the final
        # edge reaches (or overshoots) the boundary; overshoot is clamped
        # per-cell below.
        theta_edges = np.arange(
            self.cutoff_theta_rad,
            max_theta + self.angular_resolution_rad,
            self.angular_resolution_rad,
        )
        phi_edges = np.arange(
            0, 2 * np.pi + self.angular_resolution_rad, self.angular_resolution_rad
        )

        cells = []

        for i in range(len(theta_edges) - 1):
            theta_min = theta_edges[i]
            # Clamp the outer edge so the last band never extends below the
            # horizon (the padded arange can overshoot π/2 by up to one step).
            theta_max = min(max_theta, theta_edges[i + 1])

            for j in range(len(phi_edges) - 1):
                phi_min = phi_edges[j]
                # Same clamp in azimuth: the last sector closes exactly at 2π.
                phi_max = min(2 * np.pi, phi_edges[j + 1])

                # Centres are computed from the clamped edges so they always
                # lie inside the hemisphere domain.
                cells.append(
                    {
                        "phi": (phi_min + phi_max) / 2,
                        "theta": (theta_min + theta_max) / 2,
                        "phi_min": phi_min,
                        "phi_max": phi_max,
                        "theta_min": theta_min,
                        "theta_max": theta_max,
                    }
                )

        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        theta_lims = theta_edges[:-1]
        phi_lims = [phi_edges[:-1] for _ in range(len(theta_edges) - 1)]
        # Row-major layout: band i owns one contiguous run of cell ids.
        cell_ids_list = [
            np.arange(i * (len(phi_edges) - 1), (i + 1) * (len(phi_edges) - 1))
            for i in range(len(theta_edges) - 1)
        ]

        return grid, theta_lims, phi_lims, cell_ids_list

get_grid_type()

Return the grid-type identifier string.

Returns

str "equirectangular"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equirectangular_grid.py
64
65
66
67
68
69
70
71
72
73
def get_grid_type(self) -> str:
    """Return the grid-type identifier string.

    Returns
    -------
    str
        ``"equirectangular"``

    """
    # Resolve the enum member first, then hand back its string value.
    grid_type = GridType.EQUIRECTANGULAR
    return grid_type.value

HEALPixBuilder

Bases: BaseGridBuilder

HEALPix tessellation (Hierarchical Equal Area isoLatitude Pixelization).

HEALPix partitions the sphere into 12 base pixels arranged at equal latitudes. Each base pixel is recursively subdivided into 4 children, producing 12 × nside² pixels on the full sphere, all with exactly the same solid angle. This strict equal-area property makes HEALPix the gold standard for pixelisations that must be unbiased under solid-angle weighting.

This builder delegates the pixel geometry entirely to the healpy library. It filters the full-sphere pixelisation down to the northern hemisphere and stores approximate bounding boxes (phi_min/max, theta_min/max) derived from the pixel resolution. The bounding boxes are not the true pixel boundaries (which are curvilinear); they are only approximations suitable for quick spatial queries. For exact pixel membership use healpy.ang2pix directly.

Coordinate convention

HEALPix natively uses colatitude theta ∈ [0, π] (0 = North Pole) and longitude phi ∈ [0, 2π). This matches the GNSS convention used elsewhere in canvodpy: theta = 0 is the zenith, theta = π/2 is the horizon. No coordinate transform is applied.

What nside (resolution) means

nside is the single resolution parameter of HEALPix. It must be a power of 2. The key derived quantities are::

n_pixels   = 12 × nside²           (full sphere)
pixel_area = 4π / n_pixels          (steradians, exact)
resolution ≈ √(pixel_area)          (approximate angular diameter)
           ≈ 58.6° / nside         (degrees)
| nside | Pixels (full) | Approx resolution | Pixel area (sr) |
|-------|---------------|-------------------|-----------------|
| 1     | 12            | 58.6°             | 1.049           |
| 2     | 48            | 29.3°             | 0.262           |
| 4     | 192           | 14.7°             | 0.065           |
| 8     | 768           | 7.3°              | 0.016           |
| 16    | 3 072         | 3.7°              | 0.004           |
| 32    | 12 288        | 1.8°              | 0.001           |

When nside is not provided, it is estimated from angular_resolution and rounded to the nearest power of 2::

nside_estimate = round_to_pow2( √(3/π) × 60 / angular_resolution )

Mathematical construction

HEALPix construction is performed entirely by healpy. At a high level:

  1. The sphere is divided into 12 congruent base pixels (a curvilinear quadrilateral arrangement at three latitude zones: polar caps and equatorial belt).
  2. Each base pixel is subdivided into nside² equal-area children using a hierarchical quadtree.
  3. Pixel centres are returned by healpy.pix2ang(nside, ipix) in RING ordering (pixels ordered by increasing colatitude).
  4. This builder keeps only pixels with theta ≤ π/2 − cutoff_theta (northern hemisphere above the elevation mask).

Parameters

angular_resolution : float Approximate angular resolution in degrees. Used only to derive nside when that parameter is not given explicitly. cutoff_theta : float Elevation mask angle in degrees. Pixels with colatitude theta > π/2 − cutoff_theta (i.e. below the mask) are excluded. nside : int or None HEALPix resolution parameter. Must be a power of 2. If None, estimated from angular_resolution. phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Raises

ImportError If healpy is not installed. ValueError If nside is not a power of 2.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
class HEALPixBuilder(BaseGridBuilder):
    """HEALPix tessellation (Hierarchical Equal Area isoLatitude Pixelization).

    HEALPix partitions the sphere into 12 base pixels arranged at equal
    latitudes.  Each base pixel is recursively subdivided into 4 children,
    producing ``12 × nside²`` pixels on the full sphere, all with *exactly*
    the same solid angle.  This strict equal-area property makes HEALPix
    the gold standard for pixelisations that must be unbiased under
    solid-angle weighting.

    This builder delegates the pixel geometry entirely to the ``healpy``
    library.  It filters the full-sphere pixelisation down to the northern
    hemisphere and stores approximate bounding boxes (``phi_min/max``,
    ``theta_min/max``) derived from the pixel resolution.  The bounding
    boxes are **not** the true pixel boundaries (which are curvilinear);
    they are only approximations suitable for quick spatial queries.  For
    exact pixel membership use ``healpy.ang2pix`` directly.

    Coordinate convention
    ---------------------
    HEALPix natively uses colatitude ``theta ∈ [0, π]`` (0 = North Pole)
    and longitude ``phi ∈ [0, 2π)``.  This matches the GNSS convention used
    elsewhere in canvodpy: theta = 0 is the zenith, theta = π/2 is the
    horizon.  **No coordinate transform is applied.**

    What ``nside`` (resolution) means
    ----------------------------------
    ``nside`` is the single resolution parameter of HEALPix.  It must be a
    power of 2.  The key derived quantities are::

        n_pixels   = 12 × nside²           (full sphere)
        pixel_area = 4π / n_pixels          (steradians, exact)
        resolution ≈ √(pixel_area)          (approximate angular diameter)
                   ≈ 58.6° / nside         (degrees)

    | nside | Pixels (full) | Approx resolution | Pixel area (sr) |
    |-------|---------------|-------------------|-----------------|
    | 1     | 12            | 58.6°             | 1.049           |
    | 2     | 48            | 29.3°             | 0.262           |
    | 4     | 192           | 14.7°             | 0.065           |
    | 8     | 768           | 7.3°              | 0.016           |
    | 16    | 3 072         | 3.7°              | 0.004           |
    | 32    | 12 288        | 1.8°              | 0.001           |

    When ``nside`` is not provided, it is estimated from ``angular_resolution``
    and rounded to the nearest power of 2::

        nside_estimate = round_to_pow2( √(3/π) × 60 / angular_resolution )

    Mathematical construction
    -------------------------
    HEALPix construction is performed entirely by ``healpy``.  At a high
    level:

    1. The sphere is divided into 12 congruent base pixels (a curvilinear
       quadrilateral arrangement at three latitude zones: polar caps and
       equatorial belt).
    2. Each base pixel is subdivided into ``nside²`` equal-area children
       using a hierarchical quadtree.
    3. Pixel centres are returned by ``healpy.pix2ang(nside, ipix)`` in
       RING ordering (pixels ordered by increasing colatitude).
    4. This builder keeps only pixels with ``theta ≤ π/2 − cutoff_theta``
       (northern hemisphere above the elevation mask).

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to derive
        ``nside`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Pixels with colatitude
        ``theta > π/2 − cutoff_theta`` (i.e. below the mask) are excluded.
    nside : int or None
        HEALPix resolution parameter.  Must be a power of 2.  If ``None``,
        estimated from ``angular_resolution``.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Raises
    ------
    ImportError
        If ``healpy`` is not installed.
    ValueError
        If ``nside`` is not a power of 2.

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        nside: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the HEALPix grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        nside : int | None, optional
            HEALPix nside parameter.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)

        # Determine nside
        if nside is None:
            nside_estimate = int(np.sqrt(3 / np.pi) * 60 / angular_resolution)
            self.nside = 2 ** max(0, int(np.round(np.log2(nside_estimate))))
        else:
            # Power-of-2 check via the standard bit trick.
            if nside < 1 or (nside & (nside - 1)) != 0:
                raise ValueError(f"nside must be a power of 2, got {nside}")
            self.nside = nside

        # Import healpy lazily so the rest of the package works without it.
        try:
            import healpy as hp

            self.hp = hp
        except ImportError as err:
            # Chain the original error so the underlying import failure is
            # preserved in the traceback.
            raise ImportError(
                "healpy is required for HEALPix grid. Install with: pip install healpy"
            ) from err

        pixel_size_arcmin = self.hp.nside2resol(self.nside, arcmin=True)
        self.actual_angular_resolution = pixel_size_arcmin / 60.0

        self._logger.info(
            f"HEALPix: nside={self.nside}, "
            f"requested_res={angular_resolution:.2f}°, "
            f"actual_res={self.actual_angular_resolution:.2f}°"
        )

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"healpix"``

        """
        return GridType.HEALPIX.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Build HEALPix grid for the northern hemisphere.

        Iterates over all ``12 × nside²`` pixels, retains those with
        ``theta ≤ π/2 − cutoff_theta``, and constructs approximate
        bounding boxes from the pixel resolution.

        Returns
        -------
        grid : pl.DataFrame
            One row per pixel.  Contains phi, theta (centre), approximate
            bounding-box limits, ``healpix_ipix`` (RING-ordered pixel index),
            and ``healpix_nside``.
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing the valid pixel indices.

        """
        npix = self.hp.nside2npix(self.nside)

        # Loop-invariant: the pixel angular size depends only on nside, so
        # compute it once instead of once per pixel.
        pixel_radius = self.hp.nside2resol(self.nside)

        cells = []
        valid_pixels = []

        for ipix in range(npix):
            theta, phi = self.hp.pix2ang(self.nside, ipix)

            # Keep only northern hemisphere above the elevation mask
            if theta > (np.pi / 2 - self.cutoff_theta_rad):
                continue

            cells.append(
                {
                    "phi": float(phi),
                    "theta": float(theta),
                    "phi_min": float(max(0, phi - pixel_radius / 2)),
                    "phi_max": float(min(2 * np.pi, phi + pixel_radius / 2)),
                    "theta_min": float(max(0, theta - pixel_radius / 2)),
                    "theta_max": float(min(np.pi / 2, theta + pixel_radius / 2)),
                    "healpix_ipix": int(ipix),
                    "healpix_nside": int(self.nside),
                }
            )
            valid_pixels.append(int(ipix))

        if len(cells) == 0:
            raise ValueError("No valid HEALPix pixels found in hemisphere")

        grid = pl.DataFrame(cells)

        grid = grid.with_columns(
            [
                pl.col("healpix_ipix").cast(pl.Int64),
                pl.col("healpix_nside").cast(pl.Int64),
            ]
        )

        # Only the count of distinct colatitude rings is needed; no need to
        # materialize and sort the unique values.
        n_theta_bands = grid["theta"].n_unique()

        # NOTE: These limits are SYNTHETIC and do NOT correspond to actual
        # HEALPix pixel boundaries. They exist only for interface
        # compatibility with ring-based grids. For spatial queries, use the
        # per-pixel theta_min/max and phi_min/max columns instead.
        theta_lims = np.linspace(0, np.pi / 2, min(n_theta_bands, 20))
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]

        cell_ids_list = [np.array(valid_pixels, dtype=np.int64)]

        return grid, theta_lims, phi_lims, cell_ids_list

    def get_healpix_info(self) -> dict:
        """Get HEALPix-specific information.

        Returns
        -------
        info : dict
            Keys: ``nside``, ``npix_total``, ``pixel_area_sr``,
            ``pixel_area_arcmin2``, ``resolution_arcmin``,
            ``resolution_deg``, ``max_pixel_radius_deg``.

        """
        return {
            "nside": self.nside,
            "npix_total": self.hp.nside2npix(self.nside),
            "pixel_area_sr": self.hp.nside2pixarea(self.nside),
            "pixel_area_arcmin2": (
                self.hp.nside2pixarea(self.nside, degrees=True) * 3600
            ),
            "resolution_arcmin": self.hp.nside2resol(self.nside, arcmin=True),
            "resolution_deg": self.actual_angular_resolution,
            "max_pixel_radius_deg": np.rad2deg(self.hp.max_pixrad(self.nside)),
        }

__init__(angular_resolution=2, cutoff_theta=0, nside=None, phi_rotation=0)

Initialize the HEALPix grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. nside : int | None, optional HEALPix nside parameter. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    nside: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Initialize the HEALPix grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    nside : int | None, optional
        HEALPix nside parameter.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)

    # Either validate the caller-supplied nside or estimate one from the
    # requested angular resolution, rounded to the nearest power of 2.
    if nside is not None:
        if nside < 1 or nside & (nside - 1):
            raise ValueError(f"nside must be a power of 2, got {nside}")
        self.nside = nside
    else:
        estimate = int(np.sqrt(3 / np.pi) * 60 / angular_resolution)
        exponent = max(0, int(np.round(np.log2(estimate))))
        self.nside = 2**exponent

    # healpy is an optional dependency; fail with an actionable message.
    try:
        import healpy as hp
    except ImportError:
        raise ImportError(
            "healpy is required for HEALPix grid. Install with: pip install healpy"
        )
    self.hp = hp

    arcmin_per_pixel = self.hp.nside2resol(self.nside, arcmin=True)
    self.actual_angular_resolution = arcmin_per_pixel / 60.0

    self._logger.info(
        f"HEALPix: nside={self.nside}, "
        f"requested_res={angular_resolution:.2f}°, "
        f"actual_res={self.actual_angular_resolution:.2f}°"
    )

get_grid_type()

Return the grid-type identifier string.

Returns

str "healpix"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
147
148
149
150
151
152
153
154
155
156
def get_grid_type(self) -> str:
    """Return the grid-type identifier string.

    Returns
    -------
    str
        ``"healpix"``

    """
    # Resolve the enum member first, then hand back its string value.
    grid_type = GridType.HEALPIX
    return grid_type.value

get_healpix_info()

Get HEALPix-specific information.

Returns

info : dict Keys: nside, npix_total, pixel_area_sr, pixel_area_arcmin2, resolution_arcmin, resolution_deg, max_pixel_radius_deg.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
def get_healpix_info(self) -> dict:
    """Get HEALPix-specific information.

    Returns
    -------
    info : dict
        Keys: ``nside``, ``npix_total``, ``pixel_area_sr``,
        ``pixel_area_arcmin2``, ``resolution_arcmin``,
        ``resolution_deg``, ``max_pixel_radius_deg``.

    """
    hp = self.hp
    nside = self.nside

    info: dict = {"nside": nside}
    info["npix_total"] = hp.nside2npix(nside)
    info["pixel_area_sr"] = hp.nside2pixarea(nside)
    # Pixel area in square degrees, converted to square arcminutes
    # (1 deg^2 = 3600 arcmin^2).
    info["pixel_area_arcmin2"] = hp.nside2pixarea(nside, degrees=True) * 3600
    info["resolution_arcmin"] = hp.nside2resol(nside, arcmin=True)
    info["resolution_deg"] = self.actual_angular_resolution
    info["max_pixel_radius_deg"] = np.rad2deg(hp.max_pixrad(nside))
    return info

GeodesicBuilder

Bases: BaseGridBuilder

Geodesic grid based on a subdivided icosahedron.

The sphere is tessellated into triangular cells by starting with an icosahedron (20 equilateral triangles) and recursively subdividing each triangle into four smaller triangles. All vertices are projected back onto the unit sphere after each subdivision step, so the final cells are spherical triangles. The grid has no polar singularity and provides near-uniform cell areas, though strict equal-area is not guaranteed — cell areas vary by a few percent depending on how they inherit the icosahedral symmetry axes.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

Cell centres are computed as the 3D Cartesian mean of the three vertices, re-normalised onto the unit sphere.

What angular_resolution means

angular_resolution is not used directly as a cell size. Instead it is used only when subdivision_level is not explicitly supplied, to estimate an appropriate subdivision level. The heuristic targets an approximate triangle edge length of 2 × angular_resolution::

target_edge ≈ 2 × angular_resolution   (degrees)
subdivision_level = ceil(log₂(63.4 / target_edge))

The number 63.4° is the edge length of a regular icosahedron inscribed in a unit sphere. Each subdivision halves the edge length, so the actual edge length at level n is approximately::

edge ≈ 63.4° / 2ⁿ   (degrees)

The total number of triangles on the full sphere is 20 × 4ⁿ. Roughly half fall in the northern hemisphere (exact count depends on the hemisphere boundary).

Mathematical construction

  1. Icosahedron – 12 vertices placed at the intersections of three mutually perpendicular golden-ratio rectangles, normalised to the unit sphere. 20 triangular faces connect them.
  2. Subdivision – each triangle is split into 4 by inserting edge midpoints. Each midpoint is projected onto the unit sphere (re-normalised) before the next subdivision. This is repeated subdivision_level times.
  3. Hemisphere filter – faces are kept if any of their three vertices satisfies theta ≤ π/2 − cutoff_theta. Consequently, boundary triangles that straddle the horizon are included and extend slightly below it.
  4. Phi wrapping – for triangles that straddle the 0/2π azimuthal boundary, vertex phis below π are shifted by +2π before computing bounding-box limits, then wrapped back.

Parameters

angular_resolution : float
    Approximate angular resolution in degrees. Used only to derive subdivision_level when that parameter is not given explicitly.
cutoff_theta : float
    Elevation mask angle in degrees. Triangles are excluded only if all their vertices are below this elevation.
subdivision_level : int or None
    Number of icosahedral subdivisions. If None, estimated from angular_resolution. Typical range 0–5.
phi_rotation : float
    Rigid azimuthal rotation applied after construction, in degrees.

Notes

The theta_lims, phi_lims, and cell_ids fields of the returned GridData are synthetic evenly-spaced arrays kept only for interface compatibility with ring-based grids. They do not describe the actual triangular cell layout. Use the geodesic_vertices column and the vertices array in GridData.vertices for the true geometry.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
class GeodesicBuilder(BaseGridBuilder):
    """Geodesic grid based on a subdivided icosahedron.

    The sphere is tessellated into triangular cells by starting with an
    icosahedron (20 equilateral triangles) and recursively subdividing each
    triangle into four smaller triangles.  All vertices are projected back
    onto the unit sphere after each subdivision step, so the final cells are
    *spherical* triangles.  The grid has no polar singularity and provides
    near-uniform cell areas, though strict equal-area is *not* guaranteed —
    cell areas vary by a few percent depending on how they inherit the
    icosahedral symmetry axes.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    Cell centres are computed as the 3D Cartesian mean of the three vertices,
    re-normalised onto the unit sphere.

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` is **not** used directly as a cell size.  Instead it
    is used only when ``subdivision_level`` is *not* explicitly supplied, to
    *estimate* an appropriate subdivision level.  The heuristic targets an
    approximate triangle edge length of ``2 × angular_resolution``::

        target_edge ≈ 2 × angular_resolution   (degrees)
        subdivision_level = ceil(log₂(63.4 / target_edge))

    The number 63.4° is the edge length of a regular icosahedron inscribed in
    a unit sphere.  Each subdivision halves the edge length, so the actual
    edge length at level *n* is approximately::

        edge ≈ 63.4° / 2ⁿ   (degrees)

    The total number of triangles on the **full sphere** is ``20 × 4ⁿ``.
    Roughly half fall in the northern hemisphere (exact count depends on
    the hemisphere boundary).

    Mathematical construction
    -------------------------
    1. **Icosahedron** – 12 vertices placed at the intersections of three
       mutually perpendicular golden-ratio rectangles, normalised to the
       unit sphere.  20 triangular faces connect them.
    2. **Subdivision** – each triangle is split into 4 by inserting edge
       midpoints.  Each midpoint is projected onto the unit sphere
       (re-normalised) before the next subdivision.  This is repeated
       ``subdivision_level`` times.
    3. **Hemisphere filter** – faces are kept if *any* of their three
       vertices satisfies ``theta ≤ π/2 − cutoff_theta``.  Consequently,
       boundary triangles that straddle the horizon *are* included and
       extend slightly below it.
    4. **Phi wrapping** – for triangles that straddle the 0/2π azimuthal
       boundary, vertex phis below π are shifted by +2π before computing
       bounding-box limits, then wrapped back.

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to derive
        ``subdivision_level`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Triangles are excluded only if
        *all* their vertices are below this elevation.
    subdivision_level : int or None
        Number of icosahedral subdivisions.  If ``None``, estimated from
        ``angular_resolution``.  Typical range 0–5.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Notes
    -----
    The ``theta_lims``, ``phi_lims``, and ``cell_ids`` fields of the returned
    ``GridData`` are *synthetic* evenly-spaced arrays kept only for interface
    compatibility with ring-based grids.  They do **not** describe the actual
    triangular cell layout.  Use the ``geodesic_vertices`` column and the
    ``vertices`` array in ``GridData.vertices`` for the true geometry.

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        subdivision_level: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the geodesic grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        subdivision_level : int | None, optional
            Subdivision level override.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)
        # Triangle cache for visualization; populated by _build_grid().
        self._triangles: np.ndarray | None = None

        if subdivision_level is None:
            # Heuristic: target a triangle edge of ~2x the requested
            # resolution.  Each subdivision halves the icosahedron's
            # ~63.4 deg edge length, hence the log2.
            target_edge_deg = angular_resolution * 2
            self.subdivision_level = max(
                0,
                int(np.ceil(np.log2(63.4 / target_edge_deg))),
            )
        else:
            self.subdivision_level = subdivision_level

        self._logger.info(
            f"Geodesic: subdivision_level={self.subdivision_level}, "
            f"~{20 * 4**self.subdivision_level} triangles"
        )

    def get_triangles(self) -> np.ndarray | None:
        """Return triangle vertex coordinates for visualization.

        Returns
        -------
        triangles : np.ndarray or None
            Array of shape ``(n_faces, 3, 3)`` where ``triangles[i]`` contains
            the three 3D unit-sphere vertices of triangle *i*.  ``None`` if
            the grid has not been built yet.

        """
        return self._triangles

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"geodesic"``

        """
        return GridType.GEODESIC.value

    def _extract_triangle_vertices(
        self, vertices: np.ndarray, faces: np.ndarray
    ) -> np.ndarray:
        """Extract 3D vertex coordinates for each face.

        Parameters
        ----------
        vertices : np.ndarray
            All sphere vertices, shape ``(n_vertices, 3)``.
        faces : np.ndarray
            Face index array, shape ``(n_faces, 3)``.

        Returns
        -------
        triangles : np.ndarray
            Shape ``(n_faces, 3, 3)`` – three 3D vertices per face.

        """
        # NumPy advanced indexing gathers all faces in one shot.
        return vertices[faces]

    @staticmethod
    def _phi_bounds(face_phi: np.ndarray) -> tuple[float, float]:
        """Compute the bounding phi interval of one triangle.

        Handles triangles that straddle the 0/2π azimuthal boundary by
        unwrapping vertex phis relative to their median before taking
        min/max, then wrapping the limits back into [0, 2π).

        Parameters
        ----------
        face_phi : np.ndarray
            Azimuthal angles of the triangle's vertices, in radians.

        Returns
        -------
        tuple[float, float]
            ``(phi_min, phi_max)``.  For wrap-around triangles
            ``phi_min`` may exceed ``phi_max``, denoting an interval
            that crosses 0/2π.

        """
        if np.ptp(face_phi) > np.pi:
            # Triangle crosses the wraparound - unwrap relative to median
            ref_phi = np.median(face_phi)
            unwrapped = face_phi.copy()
            # Shift angles that are more than π away from the reference
            unwrapped[(ref_phi - unwrapped) > np.pi] += 2 * np.pi
            unwrapped[(unwrapped - ref_phi) > np.pi] -= 2 * np.pi
            return (
                float(np.min(unwrapped) % (2 * np.pi)),
                float(np.max(unwrapped) % (2 * np.pi)),
            )
        return float(np.min(face_phi)), float(np.max(face_phi))

    def _build_grid(
        self,
    ) -> tuple[
        pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray], dict[str, Any]
    ]:
        """Build geodesic grid from subdivided icosahedron.

        Returns
        -------
        grid : pl.DataFrame
            One row per triangular cell.  Columns include phi, theta (centre),
            bounding-box limits, ``geodesic_vertices`` (3 vertex indices into
            the ``vertices`` array), and ``geodesic_subdivision``.
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing all cell ids.
        extra_kwargs : dict
            Contains ``vertices`` (shape ``(n_vertices, 3)``),
            ``vertex_phi``, and ``vertex_theta`` arrays for the full
            subdivided icosahedron.

        Raises
        ------
        ValueError
            If no face has a vertex inside the hemisphere.

        """
        vertices, faces = self._create_icosahedron()

        # Subdivide to the requested level (each pass quadruples the faces)
        for _ in range(self.subdivision_level):
            vertices, faces = self._subdivide_mesh(vertices, faces)

        # Project to unit sphere
        vertices = vertices / np.linalg.norm(vertices, axis=1, keepdims=True)

        # Convert to spherical (theta from zenith, phi wrapped into [0, 2π))
        x, y, z = vertices[:, 0], vertices[:, 1], vertices[:, 2]
        theta = np.arccos(np.clip(z, -1, 1))
        phi = np.arctan2(y, x)
        phi = np.mod(phi, 2 * np.pi)

        # Vertices above the elevation mask (northern hemisphere)
        hemisphere_mask = theta <= (np.pi / 2 - self.cutoff_theta_rad)

        # Keep a face if ANY of its vertices is inside the hemisphere so
        # boundary triangles straddling the horizon are retained.
        # Vectorized: hemisphere_mask[faces] has shape (n_faces, 3).
        valid_faces = faces[hemisphere_mask[faces].any(axis=1)]

        if valid_faces.shape[0] == 0:
            raise ValueError("No valid faces in hemisphere")

        # Create one cell record per surviving triangle
        cells = []
        for face in valid_faces:
            v_indices = face
            face_phi = phi[v_indices]
            face_theta = theta[v_indices]

            # Bounding phi interval, handling the 0/2π wraparound
            phi_min, phi_max = self._phi_bounds(face_phi)

            # Cell center - 3D Cartesian mean re-normalised onto the sphere
            face_vertices_3d = vertices[v_indices]
            center_3d = np.mean(face_vertices_3d, axis=0)
            center_3d = center_3d / np.linalg.norm(center_3d)

            center_theta = np.arccos(np.clip(center_3d[2], -1, 1))
            center_phi = np.arctan2(center_3d[1], center_3d[0])
            center_phi = np.mod(center_phi, 2 * np.pi)

            # Theta bounds come straight from the vertex polar angles
            theta_min = float(np.min(face_theta))
            theta_max = float(np.max(face_theta))

            cells.append(
                {
                    "phi": center_phi,
                    "theta": center_theta,
                    "phi_min": phi_min,
                    "phi_max": phi_max,
                    "theta_min": theta_min,
                    "theta_max": theta_max,
                    "geodesic_vertices": v_indices.tolist(),
                    "geodesic_subdivision": self.subdivision_level,
                }
            )

        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        extra_kwargs: dict[str, Any] = {
            "vertices": vertices,
            "vertex_phi": phi,
            "vertex_theta": theta,
        }

        # Synthetic limits kept only for interface compatibility with
        # ring-based grids; they do not describe the triangular layout.
        theta_lims = np.linspace(0, np.pi / 2, 10)
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]
        cell_ids_list = [np.arange(grid.height)]

        # NOTE(review): the visualization cache covers ALL subdivided faces
        # (full sphere), not just the hemisphere-filtered cells above —
        # confirm this is intended for get_triangles().
        self._triangles = self._extract_triangle_vertices(vertices, faces)

        return grid, theta_lims, phi_lims, cell_ids_list, extra_kwargs

    def _create_icosahedron(self) -> tuple[np.ndarray, np.ndarray]:
        """Create a unit-sphere icosahedron.

        Returns
        -------
        vertices : np.ndarray
            Shape ``(12, 3)`` – vertices on the unit sphere.
        faces : np.ndarray
            Shape ``(20, 3)`` – integer vertex indices per triangular face.

        """
        # Vertices sit at the corners of three mutually perpendicular
        # golden-ratio rectangles.
        phi_golden = (1 + np.sqrt(5)) / 2

        vertices = np.array(
            [
                [-1, phi_golden, 0],
                [1, phi_golden, 0],
                [-1, -phi_golden, 0],
                [1, -phi_golden, 0],
                [0, -1, phi_golden],
                [0, 1, phi_golden],
                [0, -1, -phi_golden],
                [0, 1, -phi_golden],
                [phi_golden, 0, -1],
                [phi_golden, 0, 1],
                [-phi_golden, 0, -1],
                [-phi_golden, 0, 1],
            ],
            dtype=np.float64,
        )

        # Normalise onto the unit sphere
        vertices = vertices / np.linalg.norm(vertices, axis=1, keepdims=True)

        faces = np.array(
            [
                [0, 11, 5],
                [0, 5, 1],
                [0, 1, 7],
                [0, 7, 10],
                [0, 10, 11],
                [1, 5, 9],
                [5, 11, 4],
                [11, 10, 2],
                [10, 7, 6],
                [7, 1, 8],
                [3, 9, 4],
                [3, 4, 2],
                [3, 2, 6],
                [3, 6, 8],
                [3, 8, 9],
                [4, 9, 5],
                [2, 4, 11],
                [6, 2, 10],
                [8, 6, 7],
                [9, 8, 1],
            ],
            dtype=np.int64,
        )

        return vertices, faces

    def _subdivide_mesh(
        self, vertices: np.ndarray, faces: np.ndarray
    ) -> tuple[np.ndarray, np.ndarray]:
        """Subdivide each triangle into 4 smaller triangles.

        Each edge midpoint is computed, normalised onto the unit sphere, and
        cached so that shared edges produce only one new vertex.

        Parameters
        ----------
        vertices : np.ndarray
            Current vertex array, shape ``(n_vertices, 3)``.
        faces : np.ndarray
            Current face array, shape ``(n_faces, 3)``.

        Returns
        -------
        new_vertices : np.ndarray
            Expanded vertex array, shape ``(n_vertices + n_new_midpoints, 3)``.
        new_faces : np.ndarray
            New face array, shape ``(4 × n_faces, 3)``.

        """
        new_faces = []
        edge_midpoints: dict[tuple[int, int], int] = {}

        def get_midpoint(v1: int, v2: int) -> int:
            """Return midpoint vertex index for an edge.

            The edge key is sorted so (a, b) and (b, a) share one midpoint.

            Parameters
            ----------
            v1 : int
                First vertex index.
            v2 : int
                Second vertex index.

            Returns
            -------
            int
                Index of the midpoint vertex.

            """
            edge = tuple(sorted([v1, v2]))
            if edge not in edge_midpoints:
                # New midpoints are appended after all existing vertices
                edge_midpoints[edge] = len(vertices) + len(edge_midpoints)
            return edge_midpoints[edge]

        for face in faces:
            v0, v1, v2 = face

            m01 = get_midpoint(v0, v1)
            m12 = get_midpoint(v1, v2)
            m20 = get_midpoint(v2, v0)

            # One corner triangle per original vertex plus the central one
            new_faces.extend(
                [
                    [v0, m01, m20],
                    [v1, m12, m01],
                    [v2, m20, m12],
                    [m01, m12, m20],
                ]
            )

        n_original = len(vertices)
        n_new = len(edge_midpoints)
        final_vertices = np.zeros((n_original + n_new, 3))
        final_vertices[:n_original] = vertices

        # Place each midpoint and re-project it onto the unit sphere
        for edge, idx in edge_midpoints.items():
            v1, v2 = edge
            midpoint = (vertices[v1] + vertices[v2]) / 2
            midpoint = midpoint / np.linalg.norm(midpoint)
            final_vertices[idx] = midpoint

        return final_vertices, np.array(new_faces)

__init__(angular_resolution=2, cutoff_theta=0, subdivision_level=None, phi_rotation=0)

Initialize the geodesic grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. subdivision_level : int | None, optional Subdivision level override. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    subdivision_level: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Set up the geodesic grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    subdivision_level : int | None, optional
        Subdivision level override.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)
    # Triangle cache for visualization; filled when the grid is built.
    self._triangles: np.ndarray | None = None

    if subdivision_level is not None:
        self.subdivision_level = subdivision_level
    else:
        # Target a triangle edge of ~2x the requested resolution; each
        # subdivision halves the icosahedron's ~63.4 deg edge length.
        target_edge_deg = angular_resolution * 2
        estimated_level = int(np.ceil(np.log2(63.4 / target_edge_deg)))
        self.subdivision_level = max(0, estimated_level)

    self._logger.info(
        f"Geodesic: subdivision_level={self.subdivision_level}, "
        f"~{20 * 4**self.subdivision_level} triangles"
    )

get_triangles()

Return triangle vertex coordinates for visualization.

Returns

triangles : np.ndarray or None Array of shape (n_faces, 3, 3) where triangles[i] contains the three 3D unit-sphere vertices of triangle i. None if the grid has not been built yet.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
131
132
133
134
135
136
137
138
139
140
141
142
def get_triangles(self) -> np.ndarray | None:
    """Return triangle vertex coordinates for visualization.

    Returns
    -------
    triangles : np.ndarray or None
        Array of shape ``(n_faces, 3, 3)`` where ``triangles[i]`` contains
        the three 3D unit-sphere vertices of triangle *i*.  ``None`` if
        the grid has not been built yet.

    """
    return self._triangles

get_grid_type()

Return the grid-type identifier string.

Returns

str "geodesic"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
144
145
146
147
148
149
150
151
152
153
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        The literal string ``"geodesic"`` (taken from :class:`GridType`).

    """
    grid_kind = GridType.GEODESIC
    return grid_kind.value

FibonacciBuilder

Bases: BaseGridBuilder

Fibonacci sphere grid with spherical Voronoi tessellation.

Points are distributed on the sphere using the Fibonacci lattice (golden-spiral method), which provides one of the most uniform point distributions achievable on a sphere without iterative optimisation. Each point then becomes the centre of a spherical Voronoi cell — the region of the sphere closer to that point than to any other. The resulting tessellation has no polar singularities and near-uniform cell areas.

The tessellation is computed by scipy.spatial.SphericalVoronoi. Because Voronoi cells have curvilinear boundaries, the phi_min/max and theta_min/max columns in the grid are axis-aligned bounding boxes, not the true cell boundaries. They are unreliable for spatial queries — use the voronoi_region column (vertex indices into the SphericalVoronoi.vertices array) for exact geometry.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

What n_points (resolution) means

Resolution is controlled by n_points, the number of Voronoi cells in the hemisphere. When n_points is not supplied it is estimated from angular_resolution via::

cell_area  ≈ angular_resolution²   (radians²)
n_points   = max(10, round(2π / cell_area))

The approximate cell "diameter" (assuming a circular cell of equal area) is::

d ≈ 2 √(2π / n_points)   (radians)
  ≈ 2 × angular_resolution

angular_resolution therefore has no direct geometric meaning for this grid type — it is only a convenience for the n_points estimator.

Mathematical construction

  1. Full-sphere Fibonacci lattice2 × n_points points are generated on the unit sphere. Point i has::

    θᵢ = arccos(1 − 2(i + 0.5) / N)
    φᵢ = 2π (i + 0.5) / φ_golden   (mod 2π)

where N = 2 × n_points and φ_golden = (1+√5)/2. The +0.5 offset avoids placing points exactly at the poles.
  2. Hemisphere filter – points with θ > π/2 − cutoff_theta are discarded.
  3. Spherical Voronoi tessellation – scipy.spatial.SphericalVoronoi computes the Voronoi diagram on the unit sphere. Regions are sorted so that vertices appear in counter-clockwise order around each cell.
  4. Bounding boxes – axis-aligned bounding boxes in (phi, theta) are computed from the Voronoi vertex coordinates. These are approximations only (see caveat above).

Parameters

angular_resolution : float
    Approximate angular resolution in degrees. Used only to estimate n_points when that parameter is not given explicitly.
cutoff_theta : float
    Elevation mask angle in degrees. Points below this elevation are excluded before tessellation.
n_points : int or None
    Target number of Voronoi cells in the hemisphere. If None, estimated from angular_resolution.
phi_rotation : float
    Rigid azimuthal rotation applied after construction, in degrees.

Raises

ImportError
    If scipy is not installed.
ValueError
    If fewer than 4 points survive the hemisphere filter.

Notes

The theta_lims, phi_lims, and cell_ids fields of the returned GridData are synthetic evenly-spaced arrays kept only for interface compatibility with ring-based grids. They do not describe the actual Voronoi cell layout.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/fibonacci_grid.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
class FibonacciBuilder(BaseGridBuilder):
    """Fibonacci sphere grid with spherical Voronoi tessellation.

    Points are distributed on the sphere using the *Fibonacci lattice*
    (golden-spiral method), which provides one of the most uniform
    point distributions achievable on a sphere without iterative
    optimisation.  Each point then becomes the centre of a *spherical
    Voronoi cell* — the region of the sphere closer to that point than
    to any other.  The resulting tessellation has no polar singularities
    and near-uniform cell areas.

    The tessellation is computed by ``scipy.spatial.SphericalVoronoi``.
    Because Voronoi cells have curvilinear boundaries, the ``phi_min/max``
    and ``theta_min/max`` columns in the grid are axis-aligned *bounding
    boxes*, **not** the true cell boundaries.  They are unreliable for
    spatial queries — use the ``voronoi_region`` column (vertex indices
    into the ``SphericalVoronoi.vertices`` array) for exact geometry.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    What ``n_points`` (resolution) means
    -------------------------------------
    Resolution is controlled by ``n_points``, the number of Voronoi cells
    in the hemisphere.  When ``n_points`` is not supplied it is estimated
    from ``angular_resolution`` via::

        cell_area  ≈ angular_resolution²   (radians²)
        n_points   = max(10, int(2π / cell_area))

    (``int`` truncates toward zero, matching the implementation.)

    The approximate cell "diameter" (assuming a circular cell of equal area)
    is::

        d ≈ 2 √(2π / n_points)   (radians)
          ≈ 2 × angular_resolution

    ``angular_resolution`` therefore has **no direct geometric meaning** for
    this grid type — it is only a convenience for the ``n_points`` estimator.

    Mathematical construction
    -------------------------
    1. **Full-sphere Fibonacci lattice** – ``2 × n_points`` points are
       generated on the unit sphere.  Point *i* has::

           θᵢ = arccos(1 − 2(i + 0.5) / N)
           φᵢ = 2π (i + 0.5) / φ_golden   (mod 2π)

       where ``N = 2 × n_points`` and ``φ_golden = (1+√5)/2``.  The
       ``+0.5`` offset avoids placing points exactly at the poles.
    2. **Hemisphere filter** – points with ``θ > π/2 − cutoff_theta``
       are discarded.
    3. **Spherical Voronoi tessellation** –
       ``scipy.spatial.SphericalVoronoi`` computes the Voronoi diagram
       on the unit sphere.  Regions are sorted so that vertices appear
       in counter-clockwise order around each cell.
    4. **Bounding boxes** – axis-aligned bounding boxes in (phi, theta)
       are computed from the Voronoi vertex coordinates.  These are
       approximations only (see caveat above).

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to estimate
        ``n_points`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Points below this elevation are
        excluded before tessellation.
    n_points : int or None
        Target number of Voronoi cells in the hemisphere.  If ``None``,
        estimated from ``angular_resolution``.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Raises
    ------
    ImportError
        If ``scipy`` is not installed.
    ValueError
        If fewer than 4 points survive the hemisphere filter.

    Notes
    -----
    The ``theta_lims``, ``phi_lims``, and ``cell_ids`` fields of the returned
    ``GridData`` are *synthetic* evenly-spaced arrays kept only for interface
    compatibility with ring-based grids.  They do **not** describe the actual
    Voronoi cell layout.

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        n_points: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the Fibonacci grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        n_points : int | None, optional
            Number of points to generate.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)

        if n_points is None:
            # Estimate cell count: hemisphere solid angle (2π sr) divided by
            # the target per-cell area (angular_resolution² in rad²),
            # truncated, with a floor of 10 cells.
            cell_area = self.angular_resolution_rad**2
            hemisphere_area = 2 * np.pi
            self.n_points = max(10, int(hemisphere_area / cell_area))
        else:
            self.n_points = n_points

        self._logger.info(f"Fibonacci: generating {self.n_points} points")

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"fibonacci"``

        """
        return GridType.FIBONACCI.value

    def _build_grid(
        self,
    ) -> tuple[
        pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray], dict[str, Any]
    ]:
        """Build Fibonacci sphere grid with Voronoi tessellation.

        Returns
        -------
        grid : pl.DataFrame
            One row per Voronoi cell.  Contains phi, theta (centre),
            bounding-box limits, ``voronoi_region`` (list of vertex indices
            into the Voronoi vertex array), and ``n_vertices``.
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing all cell ids.
        extra_kwargs : dict
            Contains ``voronoi`` (the ``SphericalVoronoi`` object) and
            ``points_xyz`` (the hemisphere point cloud, shape
            ``(n_points, 3)``).

        """
        # Generate a full-sphere lattice; roughly half survives the
        # hemisphere filter below, yielding ~n_points cells.
        points_xyz = self._generate_fibonacci_sphere(self.n_points * 2)

        # Convert to spherical
        x, y, z = points_xyz[:, 0], points_xyz[:, 1], points_xyz[:, 2]
        theta = np.arccos(np.clip(z, -1, 1))
        phi = np.arctan2(y, x)
        phi = np.mod(phi, 2 * np.pi)

        # Filter to northern hemisphere above the elevation mask.
        # (np.arccos already guarantees theta >= 0, so no lower bound is
        # needed.)
        mask = theta <= (np.pi / 2 - self.cutoff_theta_rad)

        phi = phi[mask]
        theta = theta[mask]
        points_xyz = points_xyz[mask]

        if len(points_xyz) < 4:
            raise ValueError("Not enough points in hemisphere for Voronoi tessellation")

        # Import here so the rest of the module works without scipy; chain
        # the original exception for easier debugging.
        try:
            from scipy.spatial import SphericalVoronoi
        except ImportError as exc:
            raise ImportError(
                "scipy required for Fibonacci grid. Install: pip install scipy"
            ) from exc

        # Compute spherical Voronoi tessellation; sorting puts each region's
        # vertices in counter-clockwise order so polygons are well-formed.
        sv = SphericalVoronoi(points_xyz, radius=1, threshold=1e-10)
        sv.sort_vertices_of_regions()

        # Create cells
        cells = []
        for point_idx, (p_phi, p_theta) in enumerate(zip(phi, theta)):
            region_vertices = sv.regions[point_idx]

            # Defensive skip carried over from planar Voronoi, where -1 marks
            # an open region; SphericalVoronoi regions are normally closed,
            # so this is not expected to trigger.
            if -1 in region_vertices:
                continue

            region_coords = sv.vertices[region_vertices]

            # Convert region vertices to spherical
            rv_x, rv_y, rv_z = (
                region_coords[:, 0],
                region_coords[:, 1],
                region_coords[:, 2],
            )
            rv_theta = np.arccos(np.clip(rv_z, -1, 1))
            rv_phi = np.arctan2(rv_y, rv_x)
            rv_phi = np.mod(rv_phi, 2 * np.pi)

            cells.append(
                {
                    "phi": p_phi,
                    "theta": p_theta,
                    # Axis-aligned bounding box only — NOT the true cell
                    # boundary (see class docstring caveat).
                    "phi_min": np.min(rv_phi),
                    "phi_max": np.max(rv_phi),
                    "theta_min": np.min(rv_theta),
                    "theta_max": np.max(rv_theta),
                    "voronoi_region": (
                        region_vertices
                        if isinstance(region_vertices, list)
                        else region_vertices.tolist()
                    ),
                    "n_vertices": len(region_vertices),
                }
            )

        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        extra_kwargs: dict[str, Any] = {
            "voronoi": sv,
            "points_xyz": points_xyz,
        }

        # Synthetic ring-style limits for interface compatibility only.
        theta_lims = np.linspace(0, np.pi / 2, 10)
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]
        cell_ids_list = [np.arange(grid.height)]

        return grid, theta_lims, phi_lims, cell_ids_list, extra_kwargs

    def _generate_fibonacci_sphere(self, n: int) -> np.ndarray:
        """Generate points on the unit sphere using the golden-spiral lattice.

        Parameters
        ----------
        n : int
            Total number of points on the full sphere.

        Returns
        -------
        points : np.ndarray
            Shape ``(n, 3)`` – Cartesian (x, y, z) coordinates on the unit
            sphere.

        """
        golden_ratio = (1 + np.sqrt(5)) / 2

        # +0.5 offset keeps points away from the exact poles.
        indices = np.arange(0, n, dtype=np.float64) + 0.5

        # Polar angle
        theta = np.arccos(1 - 2 * indices / n)

        # Azimuthal angle
        phi = 2 * np.pi * indices / golden_ratio
        phi = np.mod(phi, 2 * np.pi)

        # Convert to Cartesian
        x = np.sin(theta) * np.cos(phi)
        y = np.sin(theta) * np.sin(phi)
        z = np.cos(theta)

        return np.column_stack([x, y, z])

__init__(angular_resolution=2, cutoff_theta=0, n_points=None, phi_rotation=0)

Initialize the Fibonacci grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. n_points : int | None, optional Number of points to generate. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/fibonacci_grid.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    n_points: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Initialize the Fibonacci grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    n_points : int | None, optional
        Number of points to generate.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)

    if n_points is None:
        # Estimate cell count: hemisphere solid angle (2π sr) divided by the
        # target per-cell area (angular_resolution² rad²), floored at 10.
        cell_area = self.angular_resolution_rad**2
        hemisphere_area = 2 * np.pi
        self.n_points = max(10, int(hemisphere_area / cell_area))
    else:
        self.n_points = n_points

    self._logger.info(f"Fibonacci: generating {self.n_points} points")

get_grid_type()

Return the grid-type identifier string.

Returns

str "fibonacci"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/fibonacci_grid.py
135
136
137
138
139
140
141
142
143
144
def get_grid_type(self) -> str:
    """Return the grid-type identifier string.

    Returns
    -------
    str
        ``"fibonacci"``

    """
    # Canonical identifier taken from the GridType enum.
    return GridType.FIBONACCI.value

HTMBuilder

Bases: BaseGridBuilder

Hierarchical Triangular Mesh (HTM) grid.

HTM divides the sphere into an octahedron (8 triangular faces), then recursively subdivides each face into 4 smaller triangles by inserting edge-midpoint vertices projected onto the unit sphere. The recursion depth is controlled by htm_level. This produces a strictly hierarchical triangulation: every triangle at level n is the union of exactly 4 triangles at level n + 1.

Cell areas are approximately equal but not strictly so — area uniformity improves with level because the octahedral edge-length asymmetry averages out over many subdivisions.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

Cell centres are the 3D Cartesian mean of the three triangle vertices, re-normalised onto the unit sphere.

What htm_level (resolution) means

The resolution is set by htm_level, not by angular_resolution. angular_resolution is used only to estimate an appropriate level when htm_level is not supplied explicitly. The heuristic is::

target_edge ≈ 2 × angular_resolution   (degrees)
htm_level   = min(15, ceil(log₂(90 / target_edge)))

The approximate triangle edge length at level n is::

edge ≈ 90° / 2ⁿ
Level Triangles (full sphere) Approx edge
0 8 90°
1 32 45°
2 128 22.5°
3 512 11.25°
4 2 048 5.6°
n 8 × 4ⁿ 90° / 2ⁿ

Mathematical construction

  1. Octahedron – 6 vertices at ±x, ±y, ±z on the unit sphere, forming 8 triangular faces (4 northern, 4 southern).
  2. Subdivision – for each triangle [v₀, v₁, v₂], three edge midpoints are computed and projected onto the unit sphere::

    m₀ = normalise((v₀ + v₁) / 2) m₁ = normalise((v₁ + v₂) / 2) m₂ = normalise((v₂ + v₀) / 2)

The four children are [v₀, m₀, m₂], [v₁, m₁, m₀], [v₂, m₂, m₁], and [m₀, m₁, m₂]. This is repeated htm_level times. 3. Hemisphere filter – a triangle is kept if any of its three vertices satisfies theta ≤ π/2 − cutoff_theta. Boundary triangles that straddle the horizon are therefore included and may extend slightly below it. 4. Each leaf triangle becomes one cell; its centre, bounding box, and three vertex coordinates are stored.

Parameters

angular_resolution : float Approximate angular resolution in degrees. Used only to derive htm_level when that parameter is not given explicitly. cutoff_theta : float Elevation mask angle in degrees. Triangles are excluded only when all their vertices are below this elevation. htm_level : int or None HTM subdivision depth. If None, estimated from angular_resolution. Practical range 0–15. phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Notes

The theta_lims, phi_lims, and cell_ids fields of the returned GridData are synthetic evenly-spaced arrays kept only for interface compatibility with ring-based grids. They do not describe the actual triangular cell layout.

HTM IDs in this implementation use a decimal-digit scheme (parent_id × 10 + child_index) which diverges from the original SDSS HTM binary-coded ID scheme. This is adequate for indexing but should not be compared with external HTM catalogues.

References

Kunszt et al. (2001): "The Hierarchical Triangular Mesh" https://www.sdss.org/dr12/algorithms/htm/

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
class HTMBuilder(BaseGridBuilder):
    """Hierarchical Triangular Mesh (HTM) grid.

    HTM divides the sphere into an octahedron (8 triangular faces), then
    recursively subdivides each face into 4 smaller triangles by inserting
    edge-midpoint vertices projected onto the unit sphere.  The recursion
    depth is controlled by ``htm_level``.  This produces a strictly
    hierarchical triangulation: every triangle at level *n* is the union of
    exactly 4 triangles at level *n* + 1.

    Cell areas are *approximately* equal but not strictly so — area
    uniformity improves with level because the octahedral edge-length
    asymmetry averages out over many subdivisions.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    Cell centres are the 3D Cartesian mean of the three triangle vertices,
    re-normalised onto the unit sphere.

    What ``htm_level`` (resolution) means
    --------------------------------------
    The resolution is set by ``htm_level``, **not** by ``angular_resolution``.
    ``angular_resolution`` is used only to *estimate* an appropriate level
    when ``htm_level`` is not supplied explicitly.  The heuristic is::

        target_edge ≈ 2 × angular_resolution   (degrees)
        htm_level   = min(15, max(0, ceil(log₂(90 / target_edge))))

    The approximate triangle edge length at level *n* is::

        edge ≈ 90° / 2ⁿ

    | Level | Triangles (full sphere) | Approx edge |
    |-------|-------------------------|-------------|
    | 0     | 8                       | 90°         |
    | 1     | 32                      | 45°         |
    | 2     | 128                     | 22.5°       |
    | 3     | 512                     | 11.25°      |
    | 4     | 2 048                   | 5.6°        |
    | n     | 8 × 4ⁿ                  | 90° / 2ⁿ   |

    Mathematical construction
    -------------------------
    1. **Octahedron** – 6 vertices at ±x, ±y, ±z on the unit sphere, forming
       8 triangular faces (4 northern, 4 southern).
    2. **Subdivision** – for each triangle [v₀, v₁, v₂], three edge
       midpoints are computed and projected onto the unit sphere::

           m₀ = normalise((v₀ + v₁) / 2)
           m₁ = normalise((v₁ + v₂) / 2)
           m₂ = normalise((v₂ + v₀) / 2)

       The four children are [v₀, m₀, m₂], [v₁, m₁, m₀], [v₂, m₂, m₁],
       and [m₀, m₁, m₂].  This is repeated ``htm_level`` times.
    3. **Hemisphere filter** – a triangle is kept if *any* of its three
       vertices satisfies ``theta ≤ π/2 − cutoff_theta``.  Boundary
       triangles that straddle the horizon are therefore included and may
       extend slightly below it.
    4. Each leaf triangle becomes one cell; its centre, bounding box, and
       three vertex coordinates are stored.

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to derive
        ``htm_level`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Triangles are excluded only when
        *all* their vertices are below this elevation.
    htm_level : int or None
        HTM subdivision depth.  If ``None``, estimated from
        ``angular_resolution``.  Practical range 0–15.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Notes
    -----
    The ``theta_lims``, ``phi_lims``, and ``cell_ids`` fields of the returned
    ``GridData`` are *synthetic* evenly-spaced arrays kept only for interface
    compatibility with ring-based grids.  They do **not** describe the actual
    triangular cell layout.

    HTM IDs in this implementation use a decimal-digit scheme
    (``parent_id × 10 + child_index``) which diverges from the original
    SDSS HTM binary-coded ID scheme.  This is adequate for indexing but
    should not be compared with external HTM catalogues.

    References
    ----------
    Kunszt et al. (2001): "The Hierarchical Triangular Mesh"
    https://www.sdss.org/dr12/algorithms/htm/

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        htm_level: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the HTM grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        htm_level : int | None, optional
            HTM subdivision level.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)

        if htm_level is None:
            # Edge length at level n is ~90°/2ⁿ; pick the smallest level
            # whose edge is at most ~2 × angular_resolution, clamped to
            # [0, 15] to avoid runaway subdivision.
            target_edge_deg = angular_resolution * 2
            self.htm_level = max(
                0,
                int(np.ceil(np.log2(90 / target_edge_deg))),
            )
            self.htm_level = min(self.htm_level, 15)
        else:
            self.htm_level = htm_level

        self._logger.info(
            f"HTM: level={self.htm_level}, ~{8 * 4**self.htm_level} triangles"
        )

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"htm"``

        """
        return GridType.HTM.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Build HTM grid by recursive octahedron subdivision.

        Returns
        -------
        grid : pl.DataFrame
            One row per triangular cell.  Contains phi, theta (centre),
            bounding-box limits, ``htm_id``, ``htm_level``, and the three
            vertex coordinate columns ``htm_vertex_0/1/2`` (each a list of
            3 floats in Cartesian xyz).
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing all cell ids.

        """
        base_vertices = np.array(
            [
                [0, 0, 1],  # 0: North pole
                [1, 0, 0],  # 1: +X
                [0, 1, 0],  # 2: +Y
                [-1, 0, 0],  # 3: -X
                [0, -1, 0],  # 4: -Y
                [0, 0, -1],  # 5: South pole
            ],
            dtype=np.float64,
        )

        base_faces = [
            [0, 1, 2],
            [0, 2, 3],
            [0, 3, 4],
            [0, 4, 1],  # Northern
            [5, 2, 1],
            [5, 3, 2],
            [5, 4, 3],
            [5, 1, 4],  # Southern
        ]

        all_triangles = []
        all_htm_ids = []

        for base_idx, base_face in enumerate(base_faces):
            v0 = base_vertices[base_face[0]]
            v1 = base_vertices[base_face[1]]
            v2 = base_vertices[base_face[2]]

            triangles, ids = self._subdivide_htm([v0, v1, v2], base_idx, self.htm_level)
            all_triangles.extend(triangles)
            all_htm_ids.extend(ids)

        # Convert to cells
        cells = []
        for tri, htm_id in zip(all_triangles, all_htm_ids):
            v0, v1, v2 = tri

            # Centre: Cartesian mean of the vertices, renormalised to the
            # unit sphere.
            center = (v0 + v1 + v2) / 3
            center = center / np.linalg.norm(center)

            theta_center = np.arccos(np.clip(center[2], -1, 1))
            phi_center = np.arctan2(center[1], center[0])
            phi_center = np.mod(phi_center, 2 * np.pi)

            # Vertex spherical coordinates, computed once and reused for
            # both the hemisphere filter and the bounding box.
            thetas, phis = [], []
            for v in [v0, v1, v2]:
                thetas.append(np.arccos(np.clip(v[2], -1, 1)))
                phis.append(np.mod(np.arctan2(v[1], v[0]), 2 * np.pi))

            # Keep a triangle if ANY vertex is above the elevation mask, so
            # boundary triangles straddling the horizon are retained.
            if all(t > (np.pi / 2 - self.cutoff_theta_rad) for t in thetas):
                continue

            cells.append(
                {
                    "phi": phi_center,
                    "theta": theta_center,
                    "phi_min": min(phis),
                    "phi_max": max(phis),
                    "theta_min": min(thetas),
                    "theta_max": max(thetas),
                    "htm_id": htm_id,
                    "htm_level": self.htm_level,
                    "htm_vertex_0": v0.tolist(),
                    "htm_vertex_1": v1.tolist(),
                    "htm_vertex_2": v2.tolist(),
                }
            )

        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        # Synthetic ring-style limits for interface compatibility only.
        theta_lims = np.linspace(0, np.pi / 2, 10)
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]
        cell_ids_list = [grid["cell_id"].to_numpy()]

        return grid, theta_lims, phi_lims, cell_ids_list

    def _subdivide_htm(
        self,
        tri: list,
        htm_id: int,
        target_level: int,
        current_level: int = 0,
    ) -> tuple[list, list]:
        """Recursively subdivide a single triangle.

        Parameters
        ----------
        tri : list of np.ndarray
            Three vertex arrays [v₀, v₁, v₂], each shape ``(3,)``.
        htm_id : int
            Current HTM identifier for this triangle.
        target_level : int
            Recursion depth to reach.
        current_level : int
            Current recursion depth.

        Returns
        -------
        triangles : list of list
            Leaf triangles at ``target_level``.
        ids : list of int
            Corresponding HTM identifiers.

        """
        if current_level == target_level:
            return [tri], [htm_id]

        v0, v1, v2 = tri

        # Midpoints projected back onto the unit sphere.
        m0 = (v0 + v1) / 2
        m0 = m0 / np.linalg.norm(m0)
        m1 = (v1 + v2) / 2
        m1 = m1 / np.linalg.norm(m1)
        m2 = (v2 + v0) / 2
        m2 = m2 / np.linalg.norm(m2)

        # 4 children
        children = [[v0, m0, m2], [v1, m1, m0], [v2, m2, m1], [m0, m1, m2]]

        all_tris = []
        all_ids = []

        for child_idx, child in enumerate(children):
            # Decimal-digit ID scheme (see class Notes): parent*10 + child.
            child_id = htm_id * 10 + child_idx
            tris, ids = self._subdivide_htm(
                child,
                child_id,
                target_level,
                current_level + 1,
            )
            all_tris.extend(tris)
            all_ids.extend(ids)

        return all_tris, all_ids

    def get_htm_info(self) -> dict:
        """Get HTM-specific information.

        Returns
        -------
        info : dict
            Keys: ``htm_level``, ``n_triangles_full_sphere``,
            ``approx_edge_length_deg``, ``approx_edge_length_arcmin``.

        """
        n_triangles = 8 * 4**self.htm_level
        approx_edge_deg = 90 / (2**self.htm_level)

        return {
            "htm_level": self.htm_level,
            "n_triangles_full_sphere": n_triangles,
            "approx_edge_length_deg": approx_edge_deg,
            "approx_edge_length_arcmin": approx_edge_deg * 60,
        }

__init__(angular_resolution=2, cutoff_theta=0, htm_level=None, phi_rotation=0)

Initialize the HTM grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. htm_level : int | None, optional HTM subdivision level. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    htm_level: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Initialize the HTM grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    htm_level : int | None, optional
        HTM subdivision level.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)

    if htm_level is None:
        # Edge length at level n is ~90°/2ⁿ; choose the smallest level whose
        # edge is at most ~2 × angular_resolution, clamped to [0, 15].
        target_edge_deg = angular_resolution * 2
        self.htm_level = max(
            0,
            int(np.ceil(np.log2(90 / target_edge_deg))),
        )
        self.htm_level = min(self.htm_level, 15)
    else:
        self.htm_level = htm_level

    self._logger.info(
        f"HTM: level={self.htm_level}, ~{8 * 4**self.htm_level} triangles"
    )

get_grid_type()

Return the grid-type identifier string.

Returns

str "htm"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
144
145
146
147
148
149
150
151
152
153
def get_grid_type(self) -> str:
    """Return the grid-type identifier string.

    Returns
    -------
    str
        ``"htm"``

    """
    # Canonical identifier taken from the GridType enum.
    return GridType.HTM.value

get_htm_info()

Get HTM-specific information.

Returns

info : dict Keys: htm_level, n_triangles_full_sphere, approx_edge_length_deg, approx_edge_length_arcmin.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
def get_htm_info(self) -> dict:
    """Get HTM-specific information.

    Returns
    -------
    info : dict
        Keys: ``htm_level``, ``n_triangles_full_sphere``,
        ``approx_edge_length_deg``, ``approx_edge_length_arcmin``.

    """
    level = self.htm_level
    # Each subdivision step multiplies the triangle count by 4 and halves
    # the approximate edge length (90° at level 0).
    edge_deg = 90 / 2**level

    return {
        "htm_level": level,
        "n_triangles_full_sphere": 8 * 4**level,
        "approx_edge_length_deg": edge_deg,
        "approx_edge_length_arcmin": edge_deg * 60,
    }

CellAggregator

Polars-based per-cell aggregation helpers.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
class CellAggregator:
    """Polars-based per-cell aggregation helpers."""

    # Aggregation methods supported by :meth:`aggregate_by_cell`; each name
    # maps directly onto the identically-named ``pl.Expr`` method.
    _SUPPORTED_METHODS = ("mean", "median", "std", "count")

    @staticmethod
    def aggregate_by_cell(
        df: pl.DataFrame,
        value_var: str = "VOD",
        method: str = "mean",
    ) -> pl.DataFrame:
        """Aggregate values by ``cell_id``.

        Parameters
        ----------
        df : pl.DataFrame
            Must contain ``cell_id`` and *value_var* columns.
        value_var : str
            Column to aggregate.
        method : {'mean', 'median', 'std', 'count'}
            Aggregation method.

        Returns
        -------
        pl.DataFrame
            Two-column DataFrame: ``cell_id``, *value_var*.

        """
        for required in ("cell_id", value_var):
            if required not in df.columns:
                raise ValueError(f"pl.DataFrame must have '{required}' column")

        if method not in CellAggregator._SUPPORTED_METHODS:
            raise ValueError(f"Unknown method: {method}")

        # Dispatch by name: method strings match pl.Expr method names exactly.
        agg_expr = getattr(pl.col(value_var), method)()

        grouped = df.group_by("cell_id").agg(agg_expr.alias(value_var))
        return grouped.sort("cell_id")

aggregate_by_cell(df, value_var='VOD', method='mean') staticmethod

Aggregate values by cell_id.

Parameters

df : pl.DataFrame Must contain cell_id and value_var columns. value_var : str Column to aggregate. method : {'mean', 'median', 'std', 'count'} Aggregation method.

Returns

pl.DataFrame Two-column DataFrame: cell_id, value_var.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
@staticmethod
def aggregate_by_cell(
    df: pl.DataFrame,
    value_var: str = "VOD",
    method: str = "mean",
) -> pl.DataFrame:
    """Aggregate values by ``cell_id``.

    Parameters
    ----------
    df : pl.DataFrame
        Must contain ``cell_id`` and *value_var* columns.
    value_var : str
        Column to aggregate.
    method : {'mean', 'median', 'std', 'count'}
        Aggregation method.

    Returns
    -------
    pl.DataFrame
        Two-column DataFrame: ``cell_id``, *value_var*.

    """
    if "cell_id" not in df.columns:
        raise ValueError("pl.DataFrame must have 'cell_id' column")
    if value_var not in df.columns:
        raise ValueError(f"pl.DataFrame must have '{value_var}' column")

    value_col = pl.col(value_var)
    expressions = {
        "mean": value_col.mean(),
        "median": value_col.median(),
        "std": value_col.std(),
        "count": value_col.count(),
    }
    try:
        agg_expr = expressions[method]
    except KeyError:
        raise ValueError(f"Unknown method: {method}") from None

    result = df.group_by("cell_id").agg(agg_expr.alias(value_var))
    return result.sort("cell_id")

AdaptedVODWorkflow

Core VOD analysis workflow with polars-optimised loading and refined temporal matching.

All heavy lifting (filtering, grid operations) is delegated to canvod.grids.analysis. This class is responsible only for Icechunk I/O and orchestration.

Parameters

vod_store_path : Path or str Path to the VOD Icechunk store directory.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
class AdaptedVODWorkflow:
    """Core VOD analysis workflow with polars-optimised loading and refined
    temporal matching.

    All heavy lifting (filtering, grid operations) is delegated to
    ``canvod.grids.analysis``.  This class is responsible only for
    Icechunk I/O and orchestration.

    Parameters
    ----------
    vod_store_path : Path or str
        Path to the VOD Icechunk store directory.

    """

    def __init__(self, vod_store_path: Path | str) -> None:
        """Initialize the workflow.

        Parameters
        ----------
        vod_store_path : Path | str
            Path to the VOD Icechunk store directory.

        """
        self.vod_store_path = Path(vod_store_path)
        self.vod_store: MyIcechunkStore = _get_store(self.vod_store_path)

    # ------------------------------------------------------------------
    # Data loading
    # ------------------------------------------------------------------

    def load_vod_data(
        self,
        group_name: str = "reference_01_canopy_01",
        branch: str = "main",
    ) -> xr.Dataset:
        """Load a VOD dataset from the store.

        Parameters
        ----------
        group_name : str
            Zarr group path inside the store.
        branch : str
            Icechunk branch to read from.

        Returns
        -------
        xr.Dataset
            Lazy-loaded VOD dataset.

        """
        logger.info("Loading VOD data from branch=%s group=%s", branch, group_name)
        with self.vod_store.readonly_session(branch=branch) as session:
            vod_ds = xr.open_zarr(session.store, group=group_name, consolidated=False)
        logger.info("Loaded VOD dataset: %s", dict(vod_ds.sizes))
        return vod_ds

    # ------------------------------------------------------------------
    # Temporal coverage checks
    # ------------------------------------------------------------------

    def check_temporal_coverage_compatibility(
        self,
        main_ds: xr.Dataset,
        processed_ds: xr.Dataset,
        requested_time_range: tuple[datetime.date, datetime.date] | None = None,
    ) -> tuple[bool, dict[str, Any]]:
        """Check whether *processed_ds* adequately covers a time range.

        When *requested_time_range* is ``None`` the method checks that the
        processed dataset covers at least 70 % of the main dataset's span.
        When a range is given it verifies that both endpoints fall within the
        processed dataset (with a 1-day tolerance).

        Parameters
        ----------
        main_ds : xr.Dataset
            Reference (unfiltered) dataset.
        processed_ds : xr.Dataset
            Filtered dataset to validate.
        requested_time_range : tuple of date, optional
            ``(start, end)`` to check against.

        Returns
        -------
        compatible : bool
        coverage_info : dict
            Diagnostic information with ``main_range``, ``processed_range``,
            and ``requested_range``.

        """

        def _date_range(ds: xr.Dataset) -> tuple[datetime.date, datetime.date]:
            """Return the (start, end) date span of a dataset's epoch axis."""
            return (
                pd.to_datetime(ds.epoch.min().values).date(),
                pd.to_datetime(ds.epoch.max().values).date(),
            )

        main_start, main_end = _date_range(main_ds)
        proc_start, proc_end = _date_range(processed_ds)

        coverage_info: dict[str, Any] = {
            "main_range": (main_start, main_end),
            "processed_range": (proc_start, proc_end),
            "requested_range": requested_time_range,
        }

        if requested_time_range is None:
            main_days = (main_end - main_start).days
            proc_days = (proc_end - proc_start).days
            ratio = proc_days / main_days if main_days > 0 else 0.0
            logger.info(
                "Coverage check: main=%d days, processed=%d days, ratio=%.1f%%",
                main_days,
                proc_days,
                ratio * 100,
            )
            return ratio >= 0.7, coverage_info

        req_start, req_end = requested_time_range
        one_day = datetime.timedelta(days=1)
        start_ok = proc_start <= req_start <= proc_end + one_day
        end_ok = proc_start - one_day <= req_end <= proc_end
        compatible = start_ok and end_ok

        if not compatible:
            # FIX: original format string was "%s%s" which ran the two
            # dates together in the log output with no separator.
            logger.warning(
                "Temporal coverage mismatch: processed=%s to %s, requested=%s to %s",
                proc_start,
                proc_end,
                req_start,
                req_end,
            )
        return compatible, coverage_info

    # ------------------------------------------------------------------
    # Filtering entry-points
    # ------------------------------------------------------------------

    def create_processed_data_fast_hampel_complete(
        self,
        start_date: datetime.date | datetime.datetime,
        end_date: datetime.date | datetime.datetime,
        force_recreate: bool = False,
        window_hours: float = 1.0,
        sigma_threshold: float = 3.0,
        min_points: int = 5,
        ultra_fast_mode: bool = False,
        cell_batch_size: int = 200,
        n_workers: int | None = None,
    ) -> str | None:
        """Run the vectorised / ultra-fast Hampel pipeline end-to-end.

        Delegates the actual filtering to
        :func:`canvod.grids.analysis.sigma_clip_filter.astropy_hampel_vectorized_fast`
        (or its ultra-fast variant) and persists the result on a
        ``processing`` branch.

        Parameters
        ----------
        start_date, end_date : date or datetime
            Temporal extent to process.
        force_recreate : bool
            Overwrite existing filtered data.
        window_hours : float
            Hampel temporal window in hours.
        sigma_threshold : float
            MAD-based outlier threshold.
        min_points : int
            Minimum observations required per window.
        ultra_fast_mode : bool
            Use the pure-NumPy sigma-clip path (faster, less precise).
        cell_batch_size : int
            Number of cells per spatial batch.
        n_workers : int, optional
            Parallel workers.  ``None`` → auto-detect.

        Returns
        -------
        str or None
            Icechunk snapshot ID, or ``None`` if existing data was kept.

        """
        return _create_processed_data_fast_hampel(
            workflow_instance=self,
            start_date=start_date,
            end_date=end_date,
            force_recreate=force_recreate,
            window_hours=window_hours,
            sigma_threshold=sigma_threshold,
            min_points=min_points,
            ultra_fast_mode=ultra_fast_mode,
            cell_batch_size=cell_batch_size,
            n_workers=n_workers,
        )

    def create_processed_data_hampel_parallel_complete(
        self,
        start_date: datetime.date | datetime.datetime,
        end_date: datetime.date | datetime.datetime,
        force_recreate: bool = False,
        threshold: float = 3.0,
        min_obs_per_sid: int = 20,
        spatial_batch_size: int = 500,
        n_workers: int | None = None,
        temporal_agg: str | None = None,
        agg_method: str | None = None,
    ) -> str | None:
        """Run the parallelised cell-SID Hampel pipeline end-to-end.

        Loads the complete requested time range (no temporal chunking) and
        applies
        :func:`canvod.grids.analysis.hampel_filtering.aggr_hampel_cell_sid_parallelized`
        with spatial batching.

        Parameters
        ----------
        start_date, end_date : date or datetime
            Temporal extent to process.
        force_recreate : bool
            Overwrite existing filtered data.
        threshold : float
            MAD-based outlier threshold.
        min_obs_per_sid : int
            Minimum observations per cell-SID combination.
        spatial_batch_size : int
            Cells per spatial batch.
        n_workers : int, optional
            Parallel workers.  ``None`` → auto-detect.
        temporal_agg : str, optional
            Post-filtering aggregation frequency (e.g. ``'1H'``, ``'1D'``).
        agg_method : str, optional
            Aggregation method (e.g. ``'mean'``).

        Returns
        -------
        str or None
            Icechunk snapshot ID, or ``None`` if existing data was kept.

        """
        from canvod.grids import create_hemigrid
        from canvod.grids.analysis.hampel_filtering import (
            aggr_hampel_cell_sid_parallelized,
        )
        from canvod.grids.operations import add_cell_ids_to_ds_fast

        logger.info("=" * 60)
        logger.info("PARALLEL HAMPEL — complete temporal coverage")
        # FIX: original format string was "Range: %s%s" which ran the two
        # dates together with no separator.
        logger.info(
            "Range: %s to %s | threshold=%.1f | min_obs=%d | batch=%d | workers=%s",
            start_date,
            end_date,
            threshold,
            min_obs_per_sid,
            spatial_batch_size,
            n_workers or "auto",
        )

        # --- guard: existing data ---
        if not self._force_or_skip("processing", force_recreate):
            return None

        # --- load complete time range ---
        logger.info("Loading complete time range for parallel processing")
        with self.vod_store.readonly_session(branch="main") as session:
            # consolidated=False for consistency with every other
            # open_zarr call in this class (store has no consolidated
            # metadata).
            vod_ds = xr.open_zarr(
                session.store,
                group="reference_01_canopy_01",
                consolidated=False,
            )

        vod_ds_complete = vod_ds.sel(epoch=slice(start_date, end_date))

        if "cell_id_equal_area_2deg" not in vod_ds_complete:
            grid = create_hemigrid(grid_type="equal_area", angular_resolution=2)
            vod_ds_complete = add_cell_ids_to_ds_fast(
                vod_ds_complete, grid, "equal_area_2deg", data_var="VOD"
            )

        logger.info("Dataset loaded: %s", dict(vod_ds_complete.sizes))

        # --- filter ---
        t0 = time.time()
        vod_ds_filtered = aggr_hampel_cell_sid_parallelized(
            vod_ds_complete,
            threshold=threshold,
            min_obs_per_sid=min_obs_per_sid,
            spatial_batch_size=spatial_batch_size,
            n_workers=n_workers,
            temporal_agg=temporal_agg,
            agg_method=agg_method,
        )
        logger.info("Parallel filtering completed in %.1f s", time.time() - t0)

        # --- persist ---
        snapshot_id = self._persist_filtered(
            vod_ds_filtered,
            "parallel Cell-SID Hampel",
        )
        logger.info("Parallel Hampel complete. Snapshot: %s", snapshot_id)
        return snapshot_id

    # ------------------------------------------------------------------
    # High-level orchestration
    # ------------------------------------------------------------------

    def run_complete_workflow(
        self,
        group_name: str = "reference_01_canopy_01",
        branch: str = "auto",
        time_range: tuple[datetime.date, datetime.date] | None = None,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Orchestrate a complete analysis run.

        Auto-detection logic (``branch='auto'``) looks for Hampel-filtered
        data on the ``processing`` branch first.  If found and temporally
        compatible it is used directly; otherwise raw data from ``main`` is
        returned.

        Parameters
        ----------
        group_name : str
            Zarr group for the raw VOD data.
        branch : str
            ``'auto'`` for detection, or an explicit branch name.
        time_range : tuple of date, optional
            ``(start, end)`` to select.

        Returns
        -------
        dict
            Keys: ``final_data`` (Dataset), ``source_branch``,
            ``pre_filtered`` (bool), ``filter_type``.

        """
        logger.info("=" * 60)
        logger.info("HAMPEL-FILTERED VOD ANALYSIS WORKFLOW")
        logger.info("branch=%s group=%s time_range=%s", branch, group_name, time_range)

        if branch == "auto":
            hampel_ds = self._try_load_hampel()

            if hampel_ds is not None:
                # Validate temporal coverage when a range is requested
                if time_range is not None:
                    dataset_start = pd.to_datetime(hampel_ds.epoch.min().values).date()
                    dataset_end = pd.to_datetime(hampel_ds.epoch.max().values).date()

                    start_ok = normalize_datetime_for_comparison(
                        time_range[0]
                    ) >= normalize_datetime_for_comparison(dataset_start)
                    end_ok = normalize_datetime_for_comparison(
                        time_range[1]
                    ) <= normalize_datetime_for_comparison(dataset_end)

                    if start_ok and end_ok:
                        hampel_ds = hampel_ds.sel(
                            epoch=slice(time_range[0], time_range[1])
                        )
                    else:
                        # FIX: original format string was "(%s%s)" which ran
                        # the two dates together with no separator.
                        logger.warning(
                            "Hampel data (%s to %s) does not cover requested "
                            "range (%s to %s); falling back to main branch",
                            dataset_start,
                            dataset_end,
                            time_range[0],
                            time_range[1],
                        )
                        hampel_ds = None

                if hampel_ds is not None:
                    logger.info("Using Hampel filtered data from processing branch")
                    return {
                        "final_data": hampel_ds,
                        "source_branch": "processing",
                        "pre_filtered": True,
                        "filter_type": "hampel",
                    }

            logger.info("No usable Hampel data found; using raw data from main branch")
            branch = "main"

        # --- main branch (raw) ---
        logger.info("Loading raw data from branch=%s", branch)
        vod_ds = self.load_vod_data(group_name, branch)

        if time_range is not None:
            vod_ds = vod_ds.sel(epoch=slice(time_range[0], time_range[1]))

        # (removed a dead `results = {}` initial assignment that was always
        # overwritten here)
        results: dict[str, Any] = {
            "final_data": vod_ds,
            "source_branch": branch,
            "pre_filtered": False,
            "filter_type": "none",
        }
        logger.info("Workflow complete — source=%s", branch)
        return results

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _try_load_hampel(self) -> xr.Dataset | None:
        """Attempt to load Hampel-filtered data from the processing branch."""
        try:
            with self.vod_store.readonly_session(branch="processing") as session:
                ds = xr.open_zarr(
                    session.store,
                    group="reference_01_canopy_01_hampel_filtered",
                    consolidated=False,
                )
            logger.info("Found Hampel filtered data on processing branch")
            return ds
        except Exception:
            # Best-effort probe: any failure (missing branch/group) simply
            # means no filtered data is available.
            logger.debug("No Hampel data on processing branch", exc_info=True)
            return None

    def _force_or_skip(self, branch: str, force_recreate: bool) -> bool:
        """Guard pattern: return ``True`` to proceed, ``False`` to skip.

        If filtered data already exists and *force_recreate* is ``False``
        the method logs a warning and returns ``False``.  When
        *force_recreate* is ``True`` it deletes the branch first.
        """
        exists = self._try_load_hampel() is not None
        if exists and not force_recreate:
            logger.warning(
                "Filtered data already exists. Pass force_recreate=True to overwrite."
            )
            return False
        if exists and force_recreate:
            try:
                self.vod_store.delete_branch(branch)
                logger.info("Deleted existing %s branch", branch)
            except Exception:
                logger.warning("Could not delete branch %s", branch, exc_info=True)
        return True

    def _persist_filtered(
        self,
        ds: xr.Dataset,
        label: str,
        target_group: str = "reference_01_canopy_01_hampel_filtered",
    ) -> str:
        """Write a filtered dataset to the ``processing`` branch.

        Rechunks variables along the ``epoch`` dimension (max 50 000 epochs
        per chunk) before writing.

        Returns the Icechunk snapshot ID.
        """
        from icechunk.xarray import to_icechunk

        # Ensure processing branch exists
        try:
            current_snapshot = next(self.vod_store.repo.ancestry(branch="main")).id
            self.vod_store.repo.create_branch("processing", current_snapshot)
        except Exception:
            pass  # branch may already exist

        # FIX: shallow-copy before rechunking so the caller's dataset is not
        # mutated in place (the original reassigned ds[var] on the argument).
        ds = ds.copy()

        with self.vod_store.writable_session("processing") as session:
            logger.info("Persisting filtered data (%s)", label)
            for var_name in ds.data_vars:
                if "epoch" in ds[var_name].dims:
                    epoch_size = ds[var_name].sizes["epoch"]
                    ds[var_name] = ds[var_name].chunk(
                        {"epoch": min(epoch_size, 50000), "sid": -1}
                    )
            to_icechunk(ds, session, group=target_group, mode="w", safe_chunks=False)
            snapshot_id: str = session.commit(label)

        return snapshot_id

__init__(vod_store_path)

Initialize the workflow.

Parameters

vod_store_path : Path | str Path to the VOD Icechunk store directory.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
 96
 97
 98
 99
100
101
102
103
104
105
106
def __init__(self, vod_store_path: Path | str) -> None:
    """Set up the workflow around a VOD Icechunk store.

    Parameters
    ----------
    vod_store_path : Path | str
        Path to the VOD Icechunk store directory.

    """
    store_path = Path(vod_store_path)
    self.vod_store_path = store_path
    self.vod_store: MyIcechunkStore = _get_store(store_path)

load_vod_data(group_name='reference_01_canopy_01', branch='main')

Load a VOD dataset from the store.

Parameters

group_name : str Zarr group path inside the store. branch : str Icechunk branch to read from.

Returns

xr.Dataset Lazy-loaded VOD dataset.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def load_vod_data(
    self,
    group_name: str = "reference_01_canopy_01",
    branch: str = "main",
) -> xr.Dataset:
    """Open a VOD dataset from the Icechunk store.

    Parameters
    ----------
    group_name : str
        Zarr group path inside the store.
    branch : str
        Icechunk branch to read from.

    Returns
    -------
    xr.Dataset
        Lazy-loaded VOD dataset.

    """
    logger.info("Loading VOD data from branch=%s group=%s", branch, group_name)
    with self.vod_store.readonly_session(branch=branch) as session:
        dataset = xr.open_zarr(
            session.store,
            group=group_name,
            consolidated=False,
        )
    logger.info("Loaded VOD dataset: %s", dict(dataset.sizes))
    return dataset

check_temporal_coverage_compatibility(main_ds, processed_ds, requested_time_range=None)

Check whether processed_ds adequately covers a time range.

When requested_time_range is None the method checks that the processed dataset covers at least 70 % of the main dataset's span. When a range is given it verifies that both endpoints fall within the processed dataset (with a 1-day tolerance).

Parameters

main_ds : xr.Dataset Reference (unfiltered) dataset. processed_ds : xr.Dataset Filtered dataset to validate. requested_time_range : tuple of date, optional (start, end) to check against.

Returns

compatible : bool coverage_info : dict Diagnostic information with main_range, processed_range, and requested_range.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def check_temporal_coverage_compatibility(
    self,
    main_ds: xr.Dataset,
    processed_ds: xr.Dataset,
    requested_time_range: tuple[datetime.date, datetime.date] | None = None,
) -> tuple[bool, dict[str, Any]]:
    """Check whether *processed_ds* adequately covers a time range.

    When *requested_time_range* is ``None`` the method checks that the
    processed dataset covers at least 70 % of the main dataset's span.
    When a range is given it verifies that both endpoints fall within the
    processed dataset (with a 1-day tolerance).

    Parameters
    ----------
    main_ds : xr.Dataset
        Reference (unfiltered) dataset.
    processed_ds : xr.Dataset
        Filtered dataset to validate.
    requested_time_range : tuple of date, optional
        ``(start, end)`` to check against.

    Returns
    -------
    compatible : bool
    coverage_info : dict
        Diagnostic information with ``main_range``, ``processed_range``,
        and ``requested_range``.

    """

    def _date_range(ds: xr.Dataset) -> tuple[datetime.date, datetime.date]:
        """Return the (start, end) date span of a dataset's epoch axis."""
        return (
            pd.to_datetime(ds.epoch.min().values).date(),
            pd.to_datetime(ds.epoch.max().values).date(),
        )

    main_start, main_end = _date_range(main_ds)
    proc_start, proc_end = _date_range(processed_ds)

    coverage_info: dict[str, Any] = {
        "main_range": (main_start, main_end),
        "processed_range": (proc_start, proc_end),
        "requested_range": requested_time_range,
    }

    if requested_time_range is None:
        main_days = (main_end - main_start).days
        proc_days = (proc_end - proc_start).days
        ratio = proc_days / main_days if main_days > 0 else 0.0
        logger.info(
            "Coverage check: main=%d days, processed=%d days, ratio=%.1f%%",
            main_days,
            proc_days,
            ratio * 100,
        )
        return ratio >= 0.7, coverage_info

    req_start, req_end = requested_time_range
    one_day = datetime.timedelta(days=1)
    start_ok = proc_start <= req_start <= proc_end + one_day
    end_ok = proc_start - one_day <= req_end <= proc_end
    compatible = start_ok and end_ok

    if not compatible:
        # FIX: original format string was "%s%s" which ran the two dates
        # together in the log output with no separator.
        logger.warning(
            "Temporal coverage mismatch: processed=%s to %s, requested=%s to %s",
            proc_start,
            proc_end,
            req_start,
            req_end,
        )
    return compatible, coverage_info

create_processed_data_fast_hampel_complete(start_date, end_date, force_recreate=False, window_hours=1.0, sigma_threshold=3.0, min_points=5, ultra_fast_mode=False, cell_batch_size=200, n_workers=None)

Run the vectorised / ultra-fast Hampel pipeline end-to-end.

Delegates the actual filtering to :func:canvod.grids.analysis.sigma_clip_filter.astropy_hampel_vectorized_fast (or its ultra-fast variant) and persists the result on a processing branch.

Parameters

start_date, end_date : date or datetime Temporal extent to process. force_recreate : bool Overwrite existing filtered data. window_hours : float Hampel temporal window in hours. sigma_threshold : float MAD-based outlier threshold. min_points : int Minimum observations required per window. ultra_fast_mode : bool Use the pure-NumPy sigma-clip path (faster, less precise). cell_batch_size : int Number of cells per spatial batch. n_workers : int, optional Parallel workers. None → auto-detect.

Returns

str or None Icechunk snapshot ID, or None if existing data was kept.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def create_processed_data_fast_hampel_complete(
    self,
    start_date: datetime.date | datetime.datetime,
    end_date: datetime.date | datetime.datetime,
    force_recreate: bool = False,
    window_hours: float = 1.0,
    sigma_threshold: float = 3.0,
    min_points: int = 5,
    ultra_fast_mode: bool = False,
    cell_batch_size: int = 200,
    n_workers: int | None = None,
) -> str | None:
    """Execute the vectorised / ultra-fast Hampel pipeline end-to-end.

    Thin convenience wrapper: the filtering itself is delegated to
    :func:`canvod.grids.analysis.sigma_clip_filter.astropy_hampel_vectorized_fast`
    (or its ultra-fast variant) and the result is persisted on a
    ``processing`` branch.

    Parameters
    ----------
    start_date, end_date : date or datetime
        Temporal extent to process.
    force_recreate : bool
        Overwrite existing filtered data.
    window_hours : float
        Hampel temporal window in hours.
    sigma_threshold : float
        MAD-based outlier threshold.
    min_points : int
        Minimum observations required per window.
    ultra_fast_mode : bool
        Use the pure-NumPy sigma-clip path (faster, less precise).
    cell_batch_size : int
        Number of cells per spatial batch.
    n_workers : int, optional
        Parallel workers.  ``None`` → auto-detect.

    Returns
    -------
    str or None
        Icechunk snapshot ID, or ``None`` if existing data was kept.

    """
    # Pure delegation: every option is forwarded unchanged to the
    # module-level implementation.
    pipeline_options = dict(
        start_date=start_date,
        end_date=end_date,
        force_recreate=force_recreate,
        window_hours=window_hours,
        sigma_threshold=sigma_threshold,
        min_points=min_points,
        ultra_fast_mode=ultra_fast_mode,
        cell_batch_size=cell_batch_size,
        n_workers=n_workers,
    )
    return _create_processed_data_fast_hampel(
        workflow_instance=self,
        **pipeline_options,
    )

create_processed_data_hampel_parallel_complete(start_date, end_date, force_recreate=False, threshold=3.0, min_obs_per_sid=20, spatial_batch_size=500, n_workers=None, temporal_agg=None, agg_method=None)

Run the parallelised cell-SID Hampel pipeline end-to-end.

Loads the complete requested time range (no temporal chunking) and applies :func:canvod.grids.analysis.hampel_filtering.aggr_hampel_cell_sid_parallelized with spatial batching.

Parameters

start_date, end_date : date or datetime Temporal extent to process. force_recreate : bool Overwrite existing filtered data. threshold : float MAD-based outlier threshold. min_obs_per_sid : int Minimum observations per cell-SID combination. spatial_batch_size : int Cells per spatial batch. n_workers : int, optional Parallel workers. None → auto-detect. temporal_agg : str, optional Post-filtering aggregation frequency (e.g. '1H', '1D'). agg_method : str, optional Aggregation method (e.g. 'mean').

Returns

str or None Icechunk snapshot ID, or None if existing data was kept.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
def create_processed_data_hampel_parallel_complete(
    self,
    start_date: datetime.date | datetime.datetime,
    end_date: datetime.date | datetime.datetime,
    force_recreate: bool = False,
    threshold: float = 3.0,
    min_obs_per_sid: int = 20,
    spatial_batch_size: int = 500,
    n_workers: int | None = None,
    temporal_agg: str | None = None,
    agg_method: str | None = None,
) -> str | None:
    """Run the parallelised cell-SID Hampel pipeline end-to-end.

    Loads the complete requested time range (no temporal chunking) and
    applies
    :func:`canvod.grids.analysis.hampel_filtering.aggr_hampel_cell_sid_parallelized`
    with spatial batching.

    Parameters
    ----------
    start_date, end_date : date or datetime
        Temporal extent to process.
    force_recreate : bool
        Overwrite existing filtered data.
    threshold : float
        MAD-based outlier threshold.
    min_obs_per_sid : int
        Minimum observations per cell-SID combination.
    spatial_batch_size : int
        Cells per spatial batch.
    n_workers : int, optional
        Parallel workers.  ``None`` → auto-detect.
    temporal_agg : str, optional
        Post-filtering aggregation frequency (e.g. ``'1H'``, ``'1D'``).
    agg_method : str, optional
        Aggregation method (e.g. ``'mean'``).

    Returns
    -------
    str or None
        Icechunk snapshot ID, or ``None`` if existing data was kept.

    """
    # NOTE(review): imports kept function-local, presumably to avoid an
    # import cycle with canvod.grids — confirm before hoisting.
    from canvod.grids import create_hemigrid
    from canvod.grids.analysis.hampel_filtering import (
        aggr_hampel_cell_sid_parallelized,
    )
    from canvod.grids.operations import add_cell_ids_to_ds_fast

    logger.info("=" * 60)
    logger.info("PARALLEL HAMPEL — complete temporal coverage")
    # BUG FIX: the original format "Range: %s%s" ran the two dates
    # together with no separator, producing unreadable log output.
    logger.info(
        "Range: %s to %s | threshold=%.1f | min_obs=%d | batch=%d | workers=%s",
        start_date,
        end_date,
        threshold,
        min_obs_per_sid,
        spatial_batch_size,
        n_workers or "auto",
    )

    # --- guard: existing data ---
    # Bail out early (keeping existing data) unless recreation is forced.
    if not self._force_or_skip("processing", force_recreate):
        return None

    # --- load complete time range ---
    logger.info("Loading complete time range for parallel processing")
    with self.vod_store.readonly_session(branch="main") as session:
        vod_ds = xr.open_zarr(session.store, group="reference_01_canopy_01")

    vod_ds_complete = vod_ds.sel(epoch=slice(start_date, end_date))

    # Attach 2° equal-area cell IDs if the dataset does not carry them yet.
    if "cell_id_equal_area_2deg" not in vod_ds_complete:
        grid = create_hemigrid(grid_type="equal_area", angular_resolution=2)
        vod_ds_complete = add_cell_ids_to_ds_fast(
            vod_ds_complete, grid, "equal_area_2deg", data_var="VOD"
        )

    logger.info("Dataset loaded: %s", dict(vod_ds_complete.sizes))

    # --- filter ---
    t0 = time.time()
    vod_ds_filtered = aggr_hampel_cell_sid_parallelized(
        vod_ds_complete,
        threshold=threshold,
        min_obs_per_sid=min_obs_per_sid,
        spatial_batch_size=spatial_batch_size,
        n_workers=n_workers,
        temporal_agg=temporal_agg,
        agg_method=agg_method,
    )
    logger.info("Parallel filtering completed in %.1f s", time.time() - t0)

    # --- persist ---
    snapshot_id = self._persist_filtered(
        vod_ds_filtered,
        "parallel Cell-SID Hampel",
    )
    logger.info("Parallel Hampel complete. Snapshot: %s", snapshot_id)
    return snapshot_id

run_complete_workflow(group_name='reference_01_canopy_01', branch='auto', time_range=None, **kwargs)

Orchestrate a complete analysis run.

Auto-detection logic (branch='auto') looks for Hampel-filtered data on the processing branch first. If found and temporally compatible it is used directly; otherwise raw data from main is returned.

Parameters

group_name : str Zarr group for the raw VOD data. branch : str 'auto' for detection, or an explicit branch name. time_range : tuple of date, optional (start, end) to select.

Returns

dict Keys: final_data (Dataset), source_branch, pre_filtered (bool), filter_type.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
def run_complete_workflow(
    self,
    group_name: str = "reference_01_canopy_01",
    branch: str = "auto",
    time_range: tuple[datetime.date, datetime.date] | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Orchestrate a complete analysis run.

    Auto-detection logic (``branch='auto'``) looks for Hampel-filtered
    data on the ``processing`` branch first.  If found and temporally
    compatible it is used directly; otherwise raw data from ``main`` is
    returned.

    Parameters
    ----------
    group_name : str
        Zarr group for the raw VOD data.
    branch : str
        ``'auto'`` for detection, or an explicit branch name.
    time_range : tuple of date, optional
        ``(start, end)`` to select.
    **kwargs : Any
        Accepted for interface compatibility; not used in this method.

    Returns
    -------
    dict
        Keys: ``final_data`` (Dataset), ``source_branch``,
        ``pre_filtered`` (bool), ``filter_type``.

    """
    logger.info("=" * 60)
    logger.info("HAMPEL-FILTERED VOD ANALYSIS WORKFLOW")
    logger.info("branch=%s group=%s time_range=%s", branch, group_name, time_range)

    if branch == "auto":
        hampel_ds = self._try_load_hampel()

        if hampel_ds is not None:
            # Validate temporal coverage when a range is requested
            if time_range is not None:
                dataset_start = pd.to_datetime(hampel_ds.epoch.min().values).date()
                dataset_end = pd.to_datetime(hampel_ds.epoch.max().values).date()

                # Normalize both sides so date vs. datetime comparisons
                # are well-defined.
                start_ok = normalize_datetime_for_comparison(
                    time_range[0]
                ) >= normalize_datetime_for_comparison(dataset_start)
                end_ok = normalize_datetime_for_comparison(
                    time_range[1]
                ) <= normalize_datetime_for_comparison(dataset_end)

                if start_ok and end_ok:
                    hampel_ds = hampel_ds.sel(
                        epoch=slice(time_range[0], time_range[1])
                    )
                else:
                    # BUG FIX: the original "%s%s" format ran each date
                    # pair together with no separator in the log output.
                    logger.warning(
                        "Hampel data (%s to %s) does not cover requested "
                        "range (%s to %s); falling back to main branch",
                        dataset_start,
                        dataset_end,
                        time_range[0],
                        time_range[1],
                    )
                    hampel_ds = None

            if hampel_ds is not None:
                logger.info("Using Hampel filtered data from processing branch")
                return {
                    "final_data": hampel_ds,
                    "source_branch": "processing",
                    "pre_filtered": True,
                    "filter_type": "hampel",
                }

        logger.info("No usable Hampel data found; using raw data from main branch")
        branch = "main"

    # --- main branch (raw) ---
    logger.info("Loading raw data from branch=%s", branch)
    vod_ds = self.load_vod_data(group_name, branch)

    if time_range is not None:
        vod_ds = vod_ds.sel(epoch=slice(time_range[0], time_range[1]))

    # Removed the dead `results = {}` initialisation that was immediately
    # overwritten; the result dict is built exactly once here.
    results: dict[str, Any] = {
        "final_data": vod_ds,
        "source_branch": branch,
        "pre_filtered": False,
        "filter_type": "none",
    }
    logger.info("Workflow complete — source=%s", branch)
    return results

create_hemigrid(grid_type, angular_resolution=10.0, **kwargs)

Create hemisphere grid of specified type.

Factory function for creating various hemisphere grid types commonly used in GNSS analysis.

Parameters

grid_type : str Type of grid to create: - 'equal_area': Regular lat/lon grid with equal solid angle cells - 'equal_angle': Regular angular spacing (not recommended) - 'rectangular' or 'equirectangular': Simple rectangular grid - 'HTM': Hierarchical Triangular Mesh - 'geodesic': Geodesic sphere subdivision (icosahedron-based) - 'healpix': HEALPix grid (requires healpy) - 'fibonacci': Fibonacci sphere (requires scipy) angular_resolution : float, default 10.0 Angular resolution in degrees **kwargs Additional grid-specific parameters: - cutoff_theta : float - Maximum theta angle cutoff (degrees) - phi_rotation : float - Rotation angle (degrees) - subdivision_level : int - For geodesic/HTM grids - htm_level : int - For HTM grids specifically - nside : int - For HEALPix grids - n_points : int - For Fibonacci grids

Returns

GridData Complete hemisphere grid data structure

Examples

Equal area grid with 10° resolution

grid = create_hemigrid('equal_area', angular_resolution=10.0)

HTM grid with subdivision level 3

grid = create_hemigrid('HTM', angular_resolution=5.0, htm_level=3)

Geodesic grid

grid = create_hemigrid('geodesic', angular_resolution=5.0, subdivision_level=2)

Notes

Grid coordinates use navigation convention: - phi: azimuth angle, 0 to 2π (0 = North, π/2 = East, clockwise) - theta: polar angle from zenith, 0 to π/2 (0 = zenith, π/2 = horizon)

Source code in packages/canvod-grids/src/canvod/grids/__init__.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
def create_hemigrid(
    grid_type: Literal[
        "equal_area",
        "equal_angle",
        "rectangular",
        "equirectangular",
        "HTM",
        "geodesic",
        "healpix",
        "fibonacci",
    ],
    angular_resolution: float = 10.0,
    **kwargs: Any,
) -> GridData:
    """Create hemisphere grid of specified type.

    Factory function for creating various hemisphere grid types commonly
    used in GNSS analysis.

    Parameters
    ----------
    grid_type : str
        Type of grid to create:
        - 'equal_area': Regular lat/lon grid with equal solid angle cells
        - 'equal_angle': Regular angular spacing (not recommended)
        - 'rectangular' or 'equirectangular': Simple rectangular grid
        - 'HTM': Hierarchical Triangular Mesh
        - 'geodesic': Geodesic sphere subdivision (icosahedron-based)
        - 'healpix': HEALPix grid (requires healpy)
        - 'fibonacci': Fibonacci sphere (requires scipy)
    angular_resolution : float, default 10.0
        Angular resolution in degrees
    **kwargs
        Additional grid-specific parameters:
        - cutoff_theta : float - Maximum theta angle cutoff (degrees)
        - phi_rotation : float - Rotation angle (degrees)
        - subdivision_level : int - For geodesic/HTM grids
        - htm_level : int - For HTM grids specifically
        - nside : int - For HEALPix grids
        - n_points : int - For Fibonacci grids

    Returns
    -------
    GridData
        Complete hemisphere grid data structure

    Raises
    ------
    ValueError
        If ``grid_type`` is not one of the supported identifiers.

    Examples
    --------
    >>> # Equal area grid with 10° resolution
    >>> grid = create_hemigrid('equal_area', angular_resolution=10.0)
    >>>
    >>> # HTM grid with subdivision level 3
    >>> grid = create_hemigrid('HTM', angular_resolution=5.0, htm_level=3)
    >>>
    >>> # Geodesic grid
    >>> grid = create_hemigrid('geodesic', angular_resolution=5.0, subdivision_level=2)

    Notes
    -----
    Grid coordinates use navigation convention:
    - phi: azimuth angle, 0 to 2π (0 = North, π/2 = East, clockwise)
    - theta: polar angle from zenith, 0 to π/2 (0 = zenith, π/2 = horizon)

    """
    # Case-insensitive dispatch table replacing a long if/elif chain;
    # both rectangular aliases resolve to the same builder.
    builder_classes = {
        "equal_area": EqualAreaBuilder,
        "equal_angle": EqualAngleBuilder,
        "rectangular": EquirectangularBuilder,
        "equirectangular": EquirectangularBuilder,
        "htm": HTMBuilder,
        "geodesic": GeodesicBuilder,
        "healpix": HEALPixBuilder,
        "fibonacci": FibonacciBuilder,
    }

    grid_type_lower = grid_type.lower()
    if grid_type_lower not in builder_classes:
        raise ValueError(
            f"Unknown grid type: {grid_type}. "
            f"Available types: equal_area, equal_angle, rectangular, "
            f"equirectangular, HTM, geodesic, healpix, fibonacci"
        )

    builder = builder_classes[grid_type_lower](
        angular_resolution=angular_resolution,
        **kwargs,
    )
    return builder.build()

Grid Core

Base class for hemisphere grid builders.

BaseGridBuilder

Bases: ABC

Abstract base for hemispherical grid builders.

Parameters

angular_resolution : float Angular resolution in degrees cutoff_theta : float Maximum polar angle cutoff in degrees phi_rotation : float Rotation angle in degrees (applied to all phi values)

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
class BaseGridBuilder(ABC):
    """Abstract base for hemispherical grid builders.

    Concrete subclasses supply :meth:`_build_grid` and
    :meth:`get_grid_type`; :meth:`build` then assembles the final
    :class:`GridData`, applying the optional azimuth rotation.

    Parameters
    ----------
    angular_resolution : float
        Angular resolution in degrees
    cutoff_theta : float
        Maximum polar angle cutoff in degrees
    phi_rotation : float
        Rotation angle in degrees (applied to all phi values)

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        # Keep the degree-valued inputs and precompute their radian
        # counterparts once for downstream geometry.
        self.angular_resolution = angular_resolution
        self.cutoff_theta = cutoff_theta
        self.phi_rotation = phi_rotation
        self.angular_resolution_rad = np.deg2rad(angular_resolution)
        self.cutoff_theta_rad = np.deg2rad(cutoff_theta)
        self.phi_rotation_rad = np.deg2rad(phi_rotation)
        self._logger = _get_logger()

    @abstractmethod
    def _build_grid(
        self,
    ) -> (
        tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]
        | tuple[
            pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray], dict[str, Any]
        ]
    ):
        """Construct the raw grid structure.

        Returns
        -------
        grid : pl.DataFrame
            Grid cells
        theta_lims : np.ndarray
            Theta band limits
        phi_lims : list[np.ndarray]
            Phi limits per band
        cell_ids : list[np.ndarray]
            Cell IDs per band
        extra_kwargs : dict, optional
            Additional metadata

        """

    @abstractmethod
    def get_grid_type(self) -> str:
        """Return the grid type identifier."""

    def build(self) -> GridData:
        """Build hemisphere grid.

        Returns
        -------
        GridData
            Complete grid data structure

        Raises
        ------
        ValueError
            If ``_build_grid`` returns neither 4 nor 5 elements.

        """
        self._logger.info(
            "grid_build_started",
            grid_type=self.get_grid_type(),
            angular_resolution=self.angular_resolution,
        )

        result = self._build_grid()

        # A 5-tuple carries extra keyword metadata that is forwarded
        # verbatim to the GridData constructor.
        extra_kwargs: dict[str, Any] = {}
        if len(result) == 5:
            grid, theta_lims, phi_lims, cell_ids, extra_kwargs = result
        elif len(result) == 4:
            grid, theta_lims, phi_lims, cell_ids = result
        else:
            raise ValueError(f"Invalid grid builder result: {len(result)} elements")

        # Optional azimuth rotation, wrapped back into [0, 2*pi).
        if self.phi_rotation_rad != 0:
            full_turn = 2 * np.pi

            def _rotated(name: str) -> pl.Expr:
                return ((pl.col(name) + self.phi_rotation_rad) % full_turn).alias(name)

            grid = grid.with_columns([_rotated("phi")])

            if "phi_min" in grid.columns:
                grid = grid.with_columns([_rotated("phi_min"), _rotated("phi_max")])

        self._logger.info("grid_build_complete", ncells=len(grid))

        return GridData(
            grid=grid,
            theta_lims=theta_lims,
            phi_lims=phi_lims,
            cell_ids=cell_ids,
            grid_type=self.get_grid_type(),
            **extra_kwargs,
        )

__init__(angular_resolution=2, cutoff_theta=0, phi_rotation=0)

Initialize the grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    phi_rotation: float = 0,
) -> None:
    """Initialize the grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    # Store the degree-valued inputs first, then derive their radian
    # counterparts once so downstream geometry never re-converts.
    self.angular_resolution = angular_resolution
    self.cutoff_theta = cutoff_theta
    self.phi_rotation = phi_rotation
    self.angular_resolution_rad = np.deg2rad(angular_resolution)
    self.cutoff_theta_rad = np.deg2rad(cutoff_theta)
    self.phi_rotation_rad = np.deg2rad(phi_rotation)
    self._logger = _get_logger()

get_grid_type() abstractmethod

Get grid type identifier.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
84
85
86
@abstractmethod
def get_grid_type(self) -> str:
    """Get grid type identifier.

    Returns
    -------
    str
        Identifier for the concrete grid implementation; ``build``
        stores this value as ``GridData.grid_type``.

    """

build()

Build hemisphere grid.

Returns

GridData Complete grid data structure

Source code in packages/canvod-grids/src/canvod/grids/core/grid_builder.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def build(self) -> GridData:
    """Build hemisphere grid.

    Returns
    -------
    GridData
        Complete grid data structure

    Raises
    ------
    ValueError
        If ``_build_grid`` returns neither 4 nor 5 elements.

    """
    self._logger.info(
        "grid_build_started",
        grid_type=self.get_grid_type(),
        angular_resolution=self.angular_resolution,
    )

    result = self._build_grid()

    # Builders return either 4 elements, or 5 when they carry extra
    # keyword metadata forwarded to the GridData constructor below.
    if len(result) == 4:
        grid, theta_lims, phi_lims, cell_ids = result
        extra_kwargs = {}
    elif len(result) == 5:
        grid, theta_lims, phi_lims, cell_ids, extra_kwargs = result
    else:
        raise ValueError(f"Invalid grid builder result: {len(result)} elements")

    # Apply phi rotation if specified (vectorized operations)
    # The modulo keeps rotated azimuths within [0, 2*pi).
    if self.phi_rotation_rad != 0:
        grid = grid.with_columns(
            [(pl.col("phi") + self.phi_rotation_rad) % (2 * np.pi)]
        )

        # Cell bounds, when present, must rotate in lockstep with centres.
        if "phi_min" in grid.columns:
            grid = grid.with_columns(
                [
                    (
                        (pl.col("phi_min") + self.phi_rotation_rad) % (2 * np.pi)
                    ).alias("phi_min"),
                    (
                        (pl.col("phi_max") + self.phi_rotation_rad) % (2 * np.pi)
                    ).alias("phi_max"),
                ]
            )

    self._logger.info("grid_build_complete", ncells=len(grid))

    return GridData(
        grid=grid,
        theta_lims=theta_lims,
        phi_lims=phi_lims,
        cell_ids=cell_ids,
        grid_type=self.get_grid_type(),
        **extra_kwargs,
    )

Grid data container for hemisphere grids.

GridData dataclass

Immutable container for hemispherical grid structure.

Parameters

grid : pl.DataFrame Grid cells with phi, theta, and bounds theta_lims : np.ndarray Theta band limits phi_lims : list[np.ndarray] Phi limits per theta band cell_ids : list[np.ndarray] Cell IDs per theta band grid_type : str Grid type identifier solid_angles : np.ndarray, optional Solid angles per cell [steradians] metadata : dict, optional Additional grid metadata voronoi : Any, optional Voronoi tessellation object (for Fibonacci grids) vertices : np.ndarray, optional 3D vertices (for triangular grids) points_xyz : np.ndarray, optional 3D point cloud (for Fibonacci grids) vertex_phi : np.ndarray, optional Vertex phi coordinates vertex_theta : np.ndarray, optional Vertex theta coordinates

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
@dataclass(frozen=True)
class GridData:
    """Immutable container for hemispherical grid structure.

    Parameters
    ----------
    grid : pl.DataFrame
        Grid cells with phi, theta, and bounds
    theta_lims : np.ndarray
        Theta band limits
    phi_lims : list[np.ndarray]
        Phi limits per theta band
    cell_ids : list[np.ndarray]
        Cell IDs per theta band
    grid_type : str
        Grid type identifier
    solid_angles : np.ndarray, optional
        Solid angles per cell [steradians]
    metadata : dict, optional
        Additional grid metadata
    voronoi : Any, optional
        Voronoi tessellation object (for Fibonacci grids)
    vertices : np.ndarray, optional
        3D vertices (for triangular grids)
    points_xyz : np.ndarray, optional
        3D point cloud (for Fibonacci grids)
    vertex_phi : np.ndarray, optional
        Vertex phi coordinates
    vertex_theta : np.ndarray, optional
        Vertex theta coordinates

    """

    grid: pl.DataFrame
    theta_lims: np.ndarray
    phi_lims: list[np.ndarray]
    cell_ids: list[np.ndarray]
    grid_type: str
    solid_angles: np.ndarray | None = None
    metadata: dict | None = None
    voronoi: Any | None = None
    vertices: np.ndarray | None = None
    points_xyz: np.ndarray | None = None
    vertex_phi: np.ndarray | None = None
    vertex_theta: np.ndarray | None = None

    @property
    def coords(self) -> pl.DataFrame:
        """Get cell coordinates (phi/theta columns only)."""
        return self.grid.select(["phi", "theta"])

    @property
    def ncells(self) -> int:
        """Number of cells in grid."""
        return len(self.grid)

    def get_patches(self) -> pl.Series:
        """Create matplotlib patches for polar visualization."""
        # One Rectangle per cell, anchored at (phi_min, theta_min) with
        # the cell's angular extents as width/height.
        patches = [
            Rectangle(
                (row["phi_min"], row["theta_min"]),
                row["phi_max"] - row["phi_min"],
                row["theta_max"] - row["theta_min"],
                fill=True,
            )
            for row in self.grid.iter_rows(named=True)
        ]
        return pl.Series("Patches", patches)

    def get_solid_angles(self) -> np.ndarray:
        """Calculate solid angle for each cell [steradians].

        Precomputed values take precedence; otherwise dispatch to a
        grid-type-specific computation, falling back to the geometric
        (rectangular-cell) formula.
        """
        if self.solid_angles is not None:
            return self.solid_angles

        # HEALPix: every pixel has identical area, derived from nside.
        if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
            try:
                import healpy as hp

                nside = int(self.grid["healpix_nside"][0])
                return np.full(
                    len(self.grid), hp.nside2pixarea(nside), dtype=np.float64
                )
            except ImportError:
                # healpy unavailable: fall through to the generic path.
                pass

        # Geodesic
        if self.grid_type == "geodesic" and "geodesic_vertices" in self.grid.columns:
            return self._compute_geodesic_solid_angles()

        # HTM
        if self.grid_type == "htm" and "htm_vertex_0" in self.grid.columns:
            return self._compute_htm_solid_angles()

        # Fibonacci
        if self.grid_type == "fibonacci" and "voronoi_region" in self.grid.columns:
            return self._compute_voronoi_solid_angles()

        # Default
        return self._geometric_solid_angles()

    def _compute_htm_solid_angles(self) -> np.ndarray:
        """Compute solid angles for HTM triangular cells."""
        solid_angles = []

        for row in self.grid.iter_rows(named=True):
            # NOTE(review): vertices appear to be unit vectors — the
            # arccos(dot) side lengths below assume this; confirm.
            v0 = np.array(row["htm_vertex_0"])
            v1 = np.array(row["htm_vertex_1"])
            v2 = np.array(row["htm_vertex_2"])

            # Spherical excess formula
            # Side arc lengths; clip guards arccos against rounding.
            a = np.arccos(np.clip(np.dot(v1, v2), -1, 1))
            b = np.arccos(np.clip(np.dot(v0, v2), -1, 1))
            c = np.arccos(np.clip(np.dot(v0, v1), -1, 1))

            # L'Huilier's theorem: tan(E/4) from the semi-perimeter s.
            s = (a + b + c) / 2
            tan_E_4 = np.sqrt(
                np.tan(s / 2)
                * np.tan((s - a) / 2)
                * np.tan((s - b) / 2)
                * np.tan((s - c) / 2)
            )
            E = 4 * np.arctan(tan_E_4)

            solid_angles.append(E)

        return np.array(solid_angles)

    def _compute_geodesic_solid_angles(self) -> np.ndarray:
        """Compute solid angles for geodesic triangular cells."""
        vertices = self.vertices
        # Without the shared vertex array we cannot resolve the indices;
        # fall back to the geometric estimate.
        if vertices is None:
            return self._geometric_solid_angles()

        solid_angles = []
        for row in self.grid.iter_rows(named=True):
            # Each cell stores indices into the shared vertex array.
            v_indices = row["geodesic_vertices"]
            v0, v1, v2 = vertices[v_indices]

            a = np.arccos(np.clip(np.dot(v1, v2), -1, 1))
            b = np.arccos(np.clip(np.dot(v0, v2), -1, 1))
            c = np.arccos(np.clip(np.dot(v0, v1), -1, 1))

            # L'Huilier's theorem, as in the HTM case.
            s = (a + b + c) / 2
            tan_E_4 = np.sqrt(
                np.tan(s / 2)
                * np.tan((s - a) / 2)
                * np.tan((s - b) / 2)
                * np.tan((s - c) / 2)
            )
            E = 4 * np.arctan(tan_E_4)

            solid_angles.append(E)

        return np.array(solid_angles)

    def _compute_voronoi_solid_angles(self) -> np.ndarray:
        """Compute solid angles for Voronoi cells."""
        if self.voronoi is None:
            return self._geometric_solid_angles()

        sv = self.voronoi
        solid_angles = []
        for row in self.grid.iter_rows(named=True):
            region = row["voronoi_region"]
            # Degenerate region (fewer than 3 vertices): no area defined.
            if len(region) < 3:
                solid_angles.append(np.nan)
                continue

            vertices = sv.vertices[region]
            # Cell centre as a 3D unit vector from its (phi, theta).
            center = np.array(
                [
                    np.sin(row["theta"]) * np.cos(row["phi"]),
                    np.sin(row["theta"]) * np.sin(row["phi"]),
                    np.cos(row["theta"]),
                ]
            )

            # Fan-triangulate the polygon around the centre and sum the
            # spherical excess of each (center, v_i, v_{i+1}) triangle.
            total_angle = 0
            n = len(vertices)
            for i in range(n):
                v1 = vertices[i]
                v2 = vertices[(i + 1) % n]
                a = np.arccos(np.clip(np.dot(center, v1), -1, 1))
                b = np.arccos(np.clip(np.dot(center, v2), -1, 1))
                c = np.arccos(np.clip(np.dot(v1, v2), -1, 1))
                s = (a + b + c) / 2
                tan_E_4 = np.sqrt(
                    np.tan(s / 2)
                    * np.tan((s - a) / 2)
                    * np.tan((s - b) / 2)
                    * np.tan((s - c) / 2)
                )
                E = 4 * np.arctan(tan_E_4)
                total_angle += E
            solid_angles.append(total_angle)

        return np.array(solid_angles)

    def _geometric_solid_angles(self) -> np.ndarray:
        """Fallback geometric calculation.

        Uses the exact solid angle of a phi/theta-bounded cell:
        omega = delta_phi * (cos(theta_min) - cos(theta_max)).
        """
        solid_angles = []
        for row in self.grid.iter_rows(named=True):
            delta_phi = row["phi_max"] - row["phi_min"]
            cos_diff = np.cos(row["theta_min"]) - np.cos(row["theta_max"])
            omega = delta_phi * cos_diff
            solid_angles.append(omega)
        return np.array(solid_angles)

    def get_grid_stats(self) -> dict:
        """Get grid statistics including solid angle uniformity."""
        solid_angles = self.get_solid_angles()

        # NOTE(review): the CV computation assumes a nonzero mean solid
        # angle — confirm builders never produce an all-zero grid.
        stats = {
            "total_cells": self.ncells,
            "grid_type": self.grid_type,
            "theta_bands": len(self.theta_lims),
            "cells_per_band": [len(ids) for ids in self.cell_ids],
            "solid_angle_mean_sr": float(np.mean(solid_angles)),
            "solid_angle_std_sr": float(np.std(solid_angles)),
            "solid_angle_cv_percent": float(
                np.std(solid_angles) / np.mean(solid_angles) * 100
            ),
            "total_solid_angle_sr": float(np.sum(solid_angles)),
            "hemisphere_solid_angle_sr": 2 * np.pi,
        }

        # Add HEALPix-specific info
        if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
            try:
                import healpy as hp

                nside = int(self.grid["healpix_nside"][0])
                stats["healpix_nside"] = nside
                stats["healpix_npix_total"] = hp.nside2npix(nside)
                stats["healpix_pixel_area_sr"] = hp.nside2pixarea(nside)
                stats["healpix_resolution_arcmin"] = hp.nside2resol(nside, arcmin=True)
            except ImportError:
                pass

        return stats

coords property

Get cell coordinates.

ncells property

Number of cells in grid.

get_patches()

Create matplotlib patches for polar visualization.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
67
68
69
70
71
72
73
74
75
76
77
78
def get_patches(self) -> pl.Series:
    """Build one matplotlib ``Rectangle`` per cell for polar visualization.

    Each patch spans the cell's (phi, theta) bounding box: lower-left
    corner at (phi_min, theta_min), width phi_max - phi_min and height
    theta_max - theta_min.
    """
    rects = []
    for cell in self.grid.iter_rows(named=True):
        width = cell["phi_max"] - cell["phi_min"]
        height = cell["theta_max"] - cell["theta_min"]
        rects.append(
            Rectangle((cell["phi_min"], cell["theta_min"]), width, height, fill=True)
        )
    return pl.Series("Patches", rects)

get_solid_angles()

Calculate solid angle for each cell [steradians].

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def get_solid_angles(self) -> np.ndarray:
    """Return per-cell solid angles [steradians].

    A pre-computed value in ``self.solid_angles`` takes precedence;
    otherwise the computation is dispatched by grid type, falling back
    to the generic geometric bounding-box formula.
    """
    if self.solid_angles is not None:
        return self.solid_angles

    # HEALPix: every pixel has exactly the same area (healpy is optional).
    if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
        try:
            import healpy as hp
        except ImportError:
            pass
        else:
            nside = int(self.grid["healpix_nside"][0])
            return np.full(len(self.grid), hp.nside2pixarea(nside), dtype=np.float64)

    # Geodesic: spherical-triangle areas from stored vertices.
    if self.grid_type == "geodesic" and "geodesic_vertices" in self.grid.columns:
        return self._compute_geodesic_solid_angles()

    # HTM: hierarchical triangular mesh.
    if self.grid_type == "htm" and "htm_vertex_0" in self.grid.columns:
        return self._compute_htm_solid_angles()

    # Fibonacci: Voronoi-region areas.
    if self.grid_type == "fibonacci" and "voronoi_region" in self.grid.columns:
        return self._compute_voronoi_solid_angles()

    # Generic fallback for band/sector grids.
    return self._geometric_solid_angles()

get_grid_stats()

Get grid statistics including solid angle uniformity.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_data.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
def get_grid_stats(self) -> dict:
    """Get grid statistics including solid angle uniformity."""
    omega = self.get_solid_angles()
    mean_sr = float(np.mean(omega))
    std_sr = float(np.std(omega))

    # Core statistics shared by every grid type.
    stats = {
        "total_cells": self.ncells,
        "grid_type": self.grid_type,
        "theta_bands": len(self.theta_lims),
        "cells_per_band": [len(band) for band in self.cell_ids],
        "solid_angle_mean_sr": mean_sr,
        "solid_angle_std_sr": std_sr,
        "solid_angle_cv_percent": float(std_sr / mean_sr * 100),
        "total_solid_angle_sr": float(np.sum(omega)),
        "hemisphere_solid_angle_sr": 2 * np.pi,
    }

    # HEALPix-only extras; healpy is an optional dependency.
    if self.grid_type == "healpix" and "healpix_nside" in self.grid.columns:
        try:
            import healpy as hp

            nside = int(self.grid["healpix_nside"][0])
            stats["healpix_nside"] = nside
            stats["healpix_npix_total"] = hp.nside2npix(nside)
            stats["healpix_pixel_area_sr"] = hp.nside2pixarea(nside)
            stats["healpix_resolution_arcmin"] = hp.nside2resol(nside, arcmin=True)
        except ImportError:
            pass

    return stats

Grid type definitions for hemisphere tessellation.

GridType

Bases: Enum

Available grid projection types for hemispherical tessellation.

Source code in packages/canvod-grids/src/canvod/grids/core/grid_types.py
 6
 7
 8
 9
10
11
12
13
14
15
class GridType(Enum):
    """Supported tessellation schemes for the observation hemisphere."""

    EQUAL_AREA = "equal_area"  # ring bands, ~constant solid angle per cell
    EQUAL_ANGLE = "equal_angle"  # constant angular spacing in theta and phi
    EQUIRECTANGULAR = "equirectangular"  # plain rectangular (theta, phi) lattice
    HEALPIX = "healpix"  # hierarchical equal-area isolatitude pixelisation
    GEODESIC = "geodesic"  # subdivided icosahedron triangles
    FIBONACCI = "fibonacci"  # golden-angle spiral points + Voronoi cells
    HTM = "htm"  # Hierarchical Triangular Mesh

Grid Builders

Equal-Area Grid

Equal-area grid implementation.

EqualAreaBuilder

Bases: BaseGridBuilder

Equal solid angle tessellation using concentric theta bands.

The hemisphere is divided into annular bands of constant width in theta. Within each band the number of azimuthal (phi) sectors is chosen so that every cell subtends approximately the same solid angle. This is the only grid type that has been validated for scientific use in this codebase.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle measured from zenith (0 = straight up, π/2 = horizon)

What angular_resolution means

angular_resolution (degrees) sets the width of each theta band. All bands have this same width Δθ. The azimuthal width of cells varies by band: near the zenith cells are wide in phi; near the horizon they are narrow, so that the solid angle stays constant.

Mathematical construction

  1. Target solid angle per cell is chosen equal to the solid angle of a cap of half-angle Δθ/2::

    Ω_target = 2π (1 − cos(Δθ/2))

  2. Zenith cap – a single cell covers [0, Δθ/2] in theta and the full azimuth [0, 2π).

  3. Theta bands – edges are placed at Δθ/2, 3Δθ/2, 5Δθ/2, … up to π/2 − cutoff_theta. For each band [θ_inner, θ_outer] the band's total solid angle is::

    Ω_band = 2π (cos θ_inner − cos θ_outer)

  4. Phi divisions – the number of sectors in the band is::

    n_phi = round(Ω_band / Ω_target)

Each sector spans Δφ = 2π / n_phi. The cell centre is placed at the geometric midpoint of its (phi, theta) rectangle.

Parameters

angular_resolution : float Theta-band width in degrees. Controls both the radial resolution and (indirectly, via the equal-area constraint) the azimuthal resolution. cutoff_theta : float Minimum elevation above the horizon in degrees. Bands whose outer edge is at or below this cutoff are omitted. In GNSS terms this is the satellite elevation mask angle. phi_rotation : float Rigid rotation applied to all phi coordinates after grid construction, in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_area_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
class EqualAreaBuilder(BaseGridBuilder):
    """Equal solid angle tessellation using concentric theta bands.

    The hemisphere is divided into annular bands of constant width in theta.
    Within each band the number of azimuthal (phi) sectors is chosen so that
    every cell subtends approximately the same solid angle.  This is the only
    grid type that has been validated for scientific use in this codebase.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle measured from zenith (0 = straight up,
      π/2 = horizon)

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` (degrees) sets the **width of each theta band**.
    All bands have this same width Δθ.  The *azimuthal* width of cells varies
    by band: near the zenith cells are wide in phi; near the horizon they are
    narrow, so that the solid angle stays constant.

    Mathematical construction
    -------------------------
    1. **Target solid angle** per cell is chosen equal to the solid angle of a
       cap of half-angle Δθ/2::

           Ω_target = 2π (1 − cos(Δθ/2))

    2. **Zenith cap** – a single cell covers [0, Δθ/2] in theta and the full
       azimuth [0, 2π).

    3. **Theta bands** – edges are placed at Δθ/2, 3Δθ/2, 5Δθ/2, … up to
       π/2 − cutoff_theta.  For each band [θ_inner, θ_outer] the band's
       total solid angle is::

           Ω_band = 2π (cos θ_inner − cos θ_outer)

    4. **Phi divisions** – the number of sectors in the band is::

           n_phi = round(Ω_band / Ω_target)

       Each sector spans Δφ = 2π / n_phi.  The cell centre is placed at the
       geometric midpoint of its (phi, theta) rectangle.

    Parameters
    ----------
    angular_resolution : float
        Theta-band width in degrees.  Controls both the radial resolution and
        (indirectly, via the equal-area constraint) the azimuthal resolution.
    cutoff_theta : float
        Minimum elevation above the horizon in degrees.  Bands whose outer
        edge is at or below this cutoff are omitted.  In GNSS terms this is
        the satellite elevation mask angle.
    phi_rotation : float
        Rigid rotation applied to all phi coordinates after grid construction,
        in degrees.

    """

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"equal_area"``

        """
        return GridType.EQUAL_AREA.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Construct the equal-area hemisphere grid.

        Returns
        -------
        grid : pl.DataFrame
            One row per cell with columns: phi, theta, phi_min, phi_max,
            theta_min, theta_max, cell_id.
        theta_lims : np.ndarray
            Outer theta edge of each band (radians).
        phi_lims : list[np.ndarray]
            Array of phi_min values for each band.
        cell_ids : list[np.ndarray]
            Cell-id arrays, one per band.

        """
        # Theta band edges (from zenith to horizon)
        # arange stops before max_theta - cutoff, so the last band never
        # crosses the cutoff boundary on the horizon side.
        max_theta = np.pi / 2  # horizon
        theta_edges = np.arange(
            self.angular_resolution_rad / 2,
            max_theta - self.cutoff_theta_rad,
            self.angular_resolution_rad,
        )

        # Target solid angle per cell
        # Ω_target = solid angle of a spherical cap of half-angle Δθ/2.
        target_omega = 2 * np.pi * (1 - np.cos(self.angular_resolution_rad / 2))

        cells = []
        theta_lims = []
        phi_lims = []
        cell_ids = []

        # Zenith cell (special case) - only if cutoff allows
        next_cell_id = 0
        zenith_theta_max = self.angular_resolution_rad / 2

        if self.cutoff_theta_rad < zenith_theta_max:
            cells.append(
                pl.DataFrame(
                    {
                        "phi": [0.0],
                        "theta": [0.0],
                        "phi_min": [0.0],
                        "phi_max": [2 * np.pi],
                        # Zenith cap shrinks if a (small) cutoff intrudes
                        # into it rather than disappearing entirely.
                        "theta_min": [max(0.0, self.cutoff_theta_rad)],
                        "theta_max": [zenith_theta_max],
                    }
                )
            )
            theta_lims.append(zenith_theta_max)
            phi_lims.append(np.array([0.0]))
            cell_ids.append(np.array([0]))
            next_cell_id = 1

        # Build theta bands
        for iband, theta_outer in enumerate(theta_edges[1:]):
            # Pairing: theta_edges[iband] is the edge just inside theta_outer.
            theta_inner = theta_edges[iband]

            # Skip bands below cutoff
            # NOTE(review): this compares the band's *outer* theta edge with
            # cutoff_theta_rad; with theta measured from the zenith, a small
            # theta_outer is a band *near the zenith*, while the arange
            # endpoint above already trims the horizon side.  Confirm the
            # intended cutoff semantics.
            if theta_outer <= self.cutoff_theta_rad:
                continue

            # Solid angle of this band
            band_omega = 2 * np.pi * (np.cos(theta_inner) - np.cos(theta_outer))

            # Number of phi divisions
            n_phi = max(1, round(band_omega / target_omega))
            phi_span = 2 * np.pi / n_phi

            # Sequential ids continue from the previous (non-skipped) band.
            cell_id_list = list(range(next_cell_id, next_cell_id + n_phi))
            next_cell_id = cell_id_list[-1] + 1

            # Use arange for better precision than linspace
            phi_min_arr = np.arange(n_phi) * phi_span
            phi_max_arr = (np.arange(n_phi) + 1) * phi_span
            phi_max_arr[-1] = 2 * np.pi  # Force exact closure

            cells.append(
                pl.DataFrame(
                    {
                        "phi": (phi_min_arr + phi_max_arr) / 2,
                        "theta": np.full(n_phi, (theta_inner + theta_outer) / 2),
                        "phi_min": phi_min_arr,
                        "phi_max": phi_max_arr,
                        "theta_min": np.full(n_phi, theta_inner),
                        "theta_max": np.full(n_phi, theta_outer),
                    }
                )
            )

            theta_lims.append(theta_outer)
            phi_lims.append(phi_min_arr)
            cell_ids.append(np.array(cell_id_list))

        if len(cells) == 0:
            raise ValueError(
                "No cells generated - check cutoff_theta and angular_resolution"
            )

        # cell_id is assigned by concatenation order, which matches the
        # per-band id lists built above because bands are appended in order.
        grid = pl.concat(cells).with_columns(pl.int_range(0, pl.len()).alias("cell_id"))

        return grid, np.array(theta_lims), phi_lims, cell_ids

get_grid_type()

Return the grid-type identifier string.

Returns

str "equal_area"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_area_grid.py
68
69
70
71
72
73
74
75
76
77
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        ``"equal_area"``

    """
    grid_kind = GridType.EQUAL_AREA
    return grid_kind.value

Equal-Angle Grid

Equal-angle grid implementation.

EqualAngleBuilder

Bases: BaseGridBuilder

Equal angular spacing in both theta and phi (NOT equal area).

Every cell is a rectangle of the same angular size Δθ × Δφ in the (theta, phi) parameter space. Because solid angle depends on cos(theta), cells near the zenith subtend more solid angle than cells near the horizon. This makes the grid biased toward the zenith for any solid-angle-weighted statistic. Not recommended for scientific analysis – use EqualAreaBuilder instead.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith

What angular_resolution means

angular_resolution (degrees) is used as both the theta-band width and the phi-sector width. The number of phi divisions is constant across all bands::

n_phi = round(2π / Δθ)

and does not change with latitude.

Mathematical construction

  1. A zenith cap cell covers [0, Δθ/2] × [0, 2π).
  2. Theta band edges are placed at Δθ/2, 3Δθ/2, … up to π/2.
  3. Within every band, the full azimuth is split into n_phi sectors of equal width Δφ = 2π / n_phi.
  4. Cell centres are at the midpoint of each (phi, theta) rectangle.

Parameters

angular_resolution : float Angular spacing in degrees, applied identically in both theta and phi. cutoff_theta : float Elevation mask angle in degrees (bands below this are omitted). phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_angle_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class EqualAngleBuilder(BaseGridBuilder):
    """Equal angular spacing in both theta and phi (NOT equal area).

    Every cell is a rectangle of the same angular size Δθ × Δφ in the
    (theta, phi) parameter space.  Because solid angle depends on cos(theta),
    cells near the zenith subtend *more* solid angle than cells near the
    horizon.  This makes the grid biased toward the zenith for any
    solid-angle-weighted statistic.  **Not recommended for scientific
    analysis** – use ``EqualAreaBuilder`` instead.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` (degrees) is used as **both** the theta-band width
    and the phi-sector width.  The number of phi divisions is constant across
    all bands::

        n_phi = round(2π / Δθ)

    and does not change with latitude.

    Mathematical construction
    -------------------------
    1. A zenith cap cell covers [0, Δθ/2] × [0, 2π).
    2. Theta band edges are placed at Δθ/2, 3Δθ/2, … up to π/2.
    3. Within every band, the full azimuth is split into ``n_phi`` sectors of
       equal width Δφ = 2π / n_phi.
    4. Cell centres are at the midpoint of each (phi, theta) rectangle.

    Parameters
    ----------
    angular_resolution : float
        Angular spacing in degrees, applied identically in both theta and phi.
    cutoff_theta : float
        Elevation mask angle in degrees (bands below this are omitted).
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    """

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"equal_angle"``

        """
        return GridType.EQUAL_ANGLE.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Construct the equal-angle hemisphere grid.

        Returns
        -------
        grid : pl.DataFrame
            One row per cell.
        theta_lims : np.ndarray
            Outer theta edge of each band (radians).
        phi_lims : list[np.ndarray]
            Array of phi_min values for each band (identical across bands).
        cell_ids : list[np.ndarray]
            Cell-id arrays, one per band.

        """
        max_theta = np.pi / 2
        theta_edges = np.arange(
            self.angular_resolution_rad / 2,
            max_theta - self.cutoff_theta_rad,
            self.angular_resolution_rad,
        )

        # BUGFIX: use round() instead of int() truncation.  The class
        # contract documents n_phi = round(2π / Δθ); plain int() drops a
        # whole sector whenever floating-point error makes the quotient
        # land just below an integer (e.g. 179.9999… → 179 instead of 180).
        # This also matches EqualAreaBuilder, which rounds its phi counts.
        n_phi_divisions = int(round(2 * np.pi / self.angular_resolution_rad))

        cells = []
        theta_lims = []
        phi_lims = []
        cell_ids = []

        # Zenith cap: a single cell spanning the full azimuth.
        cells.append(
            pl.DataFrame(
                {
                    "phi": [0.0],
                    "theta": [0.0],
                    "phi_min": [0.0],
                    "phi_max": [2 * np.pi],
                    "theta_min": [0.0],
                    "theta_max": [self.angular_resolution_rad / 2],
                }
            )
        )
        theta_lims.append(self.angular_resolution_rad / 2)
        phi_lims.append(np.array([0.0]))
        cell_ids.append(np.array([0]))
        next_cell_id = 1

        for iband, theta_outer in enumerate(theta_edges[1:]):
            # theta_edges[iband] is the edge just inside theta_outer.
            theta_inner = theta_edges[iband]
            phi_span = 2 * np.pi / n_phi_divisions

            # Sequential ids continue from the previous band.
            cell_id_list = list(range(next_cell_id, next_cell_id + n_phi_divisions))
            next_cell_id = cell_id_list[-1] + 1

            phi_min_arr = np.linspace(0, 2 * np.pi - phi_span, n_phi_divisions)
            # The upper edge of each sector is the lower edge of the next;
            # the last sector closes exactly at 2π.
            phi_max_arr = np.concatenate((phi_min_arr[1:], [2 * np.pi]))

            cells.append(
                pl.DataFrame(
                    {
                        "phi": (phi_min_arr + phi_max_arr) / 2,
                        "theta": np.full(
                            n_phi_divisions,
                            (theta_inner + theta_outer) / 2,
                        ),
                        "phi_min": phi_min_arr,
                        "phi_max": phi_max_arr,
                        "theta_min": np.full(n_phi_divisions, theta_inner),
                        "theta_max": np.full(n_phi_divisions, theta_outer),
                    }
                )
            )

            theta_lims.append(theta_outer)
            phi_lims.append(phi_min_arr)
            cell_ids.append(np.array(cell_id_list))

        # NOTE(review): uses with_row_index for cell_id while
        # EqualAreaBuilder uses pl.int_range — the resulting dtypes may
        # differ (u32 vs i64); confirm downstream code is dtype-agnostic.
        grid = pl.concat(cells).with_row_index("cell_id")
        return grid, np.array(theta_lims), phi_lims, cell_ids

get_grid_type()

Return the grid-type identifier string.

Returns

str "equal_angle"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equal_angle_grid.py
53
54
55
56
57
58
59
60
61
62
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        ``"equal_angle"``

    """
    grid_kind = GridType.EQUAL_ANGLE
    return grid_kind.value

Equirectangular Grid

Equirectangular grid implementation.

EquirectangularBuilder

Bases: BaseGridBuilder

Simple rectangular grid in (theta, phi) space.

The hemisphere is divided into a regular rectangular array: a constant number of theta bands, each containing the same constant number of phi sectors. Every cell is an identical rectangle in angular coordinates. This is structurally identical to EqualAngleBuilder except for one difference in the zenith treatment: EqualAngleBuilder collapses the first band into a single zenith cap, while this builder does not — every band has the same number of sectors.

Because solid angle depends on cos(theta), cells near the zenith subtend more solid angle than cells near the horizon. This makes the grid biased toward the zenith for any solid-angle-weighted statistic. Not recommended for scientific analysis – use EqualAreaBuilder instead.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

What angular_resolution means

angular_resolution (degrees) is used as both the theta-band width and the phi-sector width. The grid is therefore square in angular coordinates::

n_theta = round((π/2 − cutoff) / Δθ)
n_phi   = round(2π / Δθ)
total cells = n_theta × n_phi

Mathematical construction

  1. Theta edges are placed at cutoff_theta, cutoff_theta + Δθ, cutoff_theta + 2Δθ, … up to π/2.
  2. Phi edges are placed at 0, Δθ, 2Δθ, … up to 2π.
  3. Every (theta_band, phi_sector) combination produces one cell. The cell centre is the midpoint of the rectangle.
  4. No special zenith cap is created; the band nearest the zenith has the same number of phi sectors as all other bands.

Parameters

angular_resolution : float Angular spacing in degrees, applied identically in both theta and phi. cutoff_theta : float Elevation mask angle in degrees. Bands whose inner edge is at or below π/2 − cutoff_theta are omitted. phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equirectangular_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class EquirectangularBuilder(BaseGridBuilder):
    """Simple rectangular grid in (theta, phi) space.

    The hemisphere is divided into a regular rectangular array: a constant
    number of theta bands, each containing the same constant number of phi
    sectors.  Every cell is an identical rectangle in angular coordinates.
    This is *structurally* identical to ``EqualAngleBuilder`` except for one
    difference in the zenith treatment: ``EqualAngleBuilder`` collapses the
    first band into a single zenith cap, while this builder does not — every
    band has the same number of sectors.

    Because solid angle depends on cos(theta), cells near the zenith subtend
    *more* solid angle than cells near the horizon.  This makes the grid
    biased toward the zenith for any solid-angle-weighted statistic.
    **Not recommended for scientific analysis** – use ``EqualAreaBuilder``
    instead.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` (degrees) is used as **both** the theta-band width
    *and* the phi-sector width.  The grid is therefore square in angular
    coordinates::

        n_theta = round((π/2 − cutoff) / Δθ)
        n_phi   = round(2π / Δθ)
        total cells = n_theta × n_phi

    Mathematical construction
    -------------------------
    1. Theta edges are placed at ``cutoff_theta``, ``cutoff_theta + Δθ``,
       ``cutoff_theta + 2Δθ``, … up to π/2.
    2. Phi edges are placed at 0, Δθ, 2Δθ, … up to 2π.
    3. Every (theta_band, phi_sector) combination produces one cell.  The
       cell centre is the midpoint of the rectangle.
    4. No special zenith cap is created; the band nearest the zenith has
       the same number of phi sectors as all other bands.

    Parameters
    ----------
    angular_resolution : float
        Angular spacing in degrees, applied identically in both theta and phi.
    cutoff_theta : float
        Elevation mask angle in degrees.  Bands whose *inner* edge is at or
        below ``π/2 − cutoff_theta`` are omitted.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    """

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"equirectangular"``

        """
        return GridType.EQUIRECTANGULAR.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Construct the equirectangular hemisphere grid.

        Returns
        -------
        grid : pl.DataFrame
            One row per cell with columns: phi, theta, phi_min, phi_max,
            theta_min, theta_max, cell_id.
        theta_lims : np.ndarray
            Inner theta edge of each band (radians).
        phi_lims : list[np.ndarray]
            Array of phi_min values for each band (identical across bands).
        cell_ids : list[np.ndarray]
            Cell-id arrays, one per band.

        """
        max_theta = np.pi / 2

        # NOTE(review): float arange with an inflated endpoint — if the
        # span (π/2 − cutoff) is not an integer multiple of Δθ the last
        # theta_max can exceed π/2, i.e. the last band dips below the
        # horizon.  Same for phi_edges possibly overshooting 2π (the
        # phi_max is clamped below, but the stored cell centre "phi" is
        # computed from the *unclamped* edge).  Confirm this is acceptable.
        theta_edges = np.arange(
            self.cutoff_theta_rad,
            max_theta + self.angular_resolution_rad,
            self.angular_resolution_rad,
        )
        phi_edges = np.arange(
            0, 2 * np.pi + self.angular_resolution_rad, self.angular_resolution_rad
        )

        cells = []

        # One cell per (theta band, phi sector) pair, row-major in theta.
        for i in range(len(theta_edges) - 1):
            theta_min, theta_max = theta_edges[i], theta_edges[i + 1]

            for j in range(len(phi_edges) - 1):
                phi_min, phi_max = phi_edges[j], phi_edges[j + 1]

                cells.append(
                    {
                        "phi": (phi_min + phi_max) / 2,
                        "theta": (theta_min + theta_max) / 2,
                        "phi_min": phi_min,
                        # Clamp the possibly-overshooting last phi edge.
                        "phi_max": min(2 * np.pi, phi_max),
                        "theta_min": theta_min,
                        "theta_max": theta_max,
                    }
                )

        # cell_id follows the row-major (theta, phi) construction order.
        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        # Unlike the other builders, theta_lims holds the *inner* edges.
        theta_lims = theta_edges[:-1]
        phi_lims = [phi_edges[:-1] for _ in range(len(theta_edges) - 1)]
        cell_ids_list = [
            np.arange(i * (len(phi_edges) - 1), (i + 1) * (len(phi_edges) - 1))
            for i in range(len(theta_edges) - 1)
        ]

        return grid, theta_lims, phi_lims, cell_ids_list

get_grid_type()

Return the grid-type identifier string.

Returns

str "equirectangular"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/equirectangular_grid.py
64
65
66
67
68
69
70
71
72
73
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        The literal ``"equirectangular"``.

    """
    grid_type = GridType.EQUIRECTANGULAR
    return grid_type.value

HEALPix Grid

HEALPix grid implementation.

HEALPixBuilder

Bases: BaseGridBuilder

HEALPix tessellation (Hierarchical Equal Area isoLatitude Pixelization).

HEALPix partitions the sphere into 12 base pixels arranged at equal latitudes. Each base pixel is recursively subdivided into 4 children, producing 12 × nside² pixels on the full sphere, all with exactly the same solid angle. This strict equal-area property makes HEALPix the gold standard for pixelisations that must be unbiased under solid-angle weighting.

This builder delegates the pixel geometry entirely to the healpy library. It filters the full-sphere pixelisation down to the northern hemisphere and stores approximate bounding boxes (phi_min/max, theta_min/max) derived from the pixel resolution. The bounding boxes are not the true pixel boundaries (which are curvilinear); they are only approximations suitable for quick spatial queries. For exact pixel membership use healpy.ang2pix directly.

Coordinate convention

HEALPix natively uses colatitude theta ∈ [0, π] (0 = North Pole) and longitude phi ∈ [0, 2π). This matches the GNSS convention used elsewhere in canvodpy: theta = 0 is the zenith, theta = π/2 is the horizon. No coordinate transform is applied.

What nside (resolution) means

nside is the single resolution parameter of HEALPix. It must be a power of 2. The key derived quantities are::

n_pixels   = 12 × nside²           (full sphere)
pixel_area = 4π / n_pixels          (steradians, exact)
resolution ≈ √(pixel_area)          (approximate angular diameter)
           ≈ 58.6° / nside         (degrees)
| nside | Pixels (full) | Approx resolution | Pixel area (sr) |
|-------|---------------|-------------------|-----------------|
| 1     | 12            | 58.6°             | 1.049           |
| 2     | 48            | 29.3°             | 0.262           |
| 4     | 192           | 14.7°             | 0.065           |
| 8     | 768           | 7.3°              | 0.016           |
| 16    | 3 072         | 3.7°              | 0.004           |
| 32    | 12 288        | 1.8°              | 0.001           |

When nside is not provided, it is estimated from angular_resolution and rounded to the nearest power of 2::

nside_estimate = round_to_pow2( √(3/π) × 60 / angular_resolution )

Mathematical construction

HEALPix construction is performed entirely by healpy. At a high level:

  1. The sphere is divided into 12 congruent base pixels (a curvilinear quadrilateral arrangement at three latitude zones: polar caps and equatorial belt).
  2. Each base pixel is subdivided into nside² equal-area children using a hierarchical quadtree.
  3. Pixel centres are returned by healpy.pix2ang(nside, ipix) in RING ordering (pixels ordered by increasing colatitude).
  4. This builder keeps only pixels with theta ≤ π/2 − cutoff_theta (northern hemisphere above the elevation mask).

Parameters

angular_resolution : float
    Approximate angular resolution in degrees. Used only to derive nside when that parameter is not given explicitly.
cutoff_theta : float
    Elevation mask angle in degrees. Pixels with colatitude theta > π/2 − cutoff_theta (i.e. below the mask) are excluded.
nside : int or None
    HEALPix resolution parameter. Must be a power of 2. If None, estimated from angular_resolution.
phi_rotation : float
    Rigid azimuthal rotation applied after construction, in degrees.

Raises

ImportError If healpy is not installed. ValueError If nside is not a power of 2.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
class HEALPixBuilder(BaseGridBuilder):
    """HEALPix tessellation (Hierarchical Equal Area isoLatitude Pixelization).

    HEALPix partitions the sphere into 12 base pixels arranged at equal
    latitudes.  Each base pixel is recursively subdivided into 4 children,
    producing ``12 × nside²`` pixels on the full sphere, all with *exactly*
    the same solid angle.  This strict equal-area property makes HEALPix
    the gold standard for pixelisations that must be unbiased under
    solid-angle weighting.

    This builder delegates the pixel geometry entirely to the ``healpy``
    library.  It filters the full-sphere pixelisation down to the northern
    hemisphere and stores approximate bounding boxes (``phi_min/max``,
    ``theta_min/max``) derived from the pixel resolution.  The bounding
    boxes are **not** the true pixel boundaries (which are curvilinear);
    they are only approximations suitable for quick spatial queries.  For
    exact pixel membership use ``healpy.ang2pix`` directly.

    Coordinate convention
    ---------------------
    HEALPix natively uses colatitude ``theta ∈ [0, π]`` (0 = North Pole)
    and longitude ``phi ∈ [0, 2π)``.  This matches the GNSS convention used
    elsewhere in canvodpy: theta = 0 is the zenith, theta = π/2 is the
    horizon.  **No coordinate transform is applied.**

    What ``nside`` (resolution) means
    ----------------------------------
    ``nside`` is the single resolution parameter of HEALPix.  It must be a
    power of 2.  The key derived quantities are::

        n_pixels   = 12 × nside²           (full sphere)
        pixel_area = 4π / n_pixels          (steradians, exact)
        resolution ≈ √(pixel_area)          (approximate angular diameter)
                   ≈ 58.6° / nside         (degrees)

    | nside | Pixels (full) | Approx resolution | Pixel area (sr) |
    |-------|---------------|-------------------|-----------------|
    | 1     | 12            | 58.6°             | 1.049           |
    | 2     | 48            | 29.3°             | 0.262           |
    | 4     | 192           | 14.7°             | 0.065           |
    | 8     | 768           | 7.3°              | 0.016           |
    | 16    | 3 072         | 3.7°              | 0.004           |
    | 32    | 12 288        | 1.8°              | 0.001           |

    When ``nside`` is not provided, it is estimated from ``angular_resolution``
    and rounded to the nearest power of 2::

        nside_estimate = round_to_pow2( √(3/π) × 60 / angular_resolution )

    Mathematical construction
    -------------------------
    HEALPix construction is performed entirely by ``healpy``.  At a high
    level:

    1. The sphere is divided into 12 congruent base pixels (a curvilinear
       quadrilateral arrangement at three latitude zones: polar caps and
       equatorial belt).
    2. Each base pixel is subdivided into ``nside²`` equal-area children
       using a hierarchical quadtree.
    3. Pixel centres are returned by ``healpy.pix2ang(nside, ipix)`` in
       RING ordering (pixels ordered by increasing colatitude).
    4. This builder keeps only pixels with ``theta ≤ π/2 − cutoff_theta``
       (northern hemisphere above the elevation mask).

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to derive
        ``nside`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Pixels with colatitude
        ``theta > π/2 − cutoff_theta`` (i.e. below the mask) are excluded.
    nside : int or None
        HEALPix resolution parameter.  Must be a power of 2.  If ``None``,
        estimated from ``angular_resolution``.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Raises
    ------
    ImportError
        If ``healpy`` is not installed.
    ValueError
        If ``nside`` is not a power of 2.

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        nside: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the HEALPix grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        nside : int | None, optional
            HEALPix nside parameter.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        Raises
        ------
        ImportError
            If ``healpy`` is not installed.
        ValueError
            If ``nside`` is given and is not a power of 2.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)

        # Determine nside
        if nside is None:
            # Clamp to >= 1 so log2 stays defined even for very coarse
            # requested resolutions (angular_resolution > ~58.6°).
            nside_estimate = max(
                1, int(np.sqrt(3 / np.pi) * 60 / angular_resolution)
            )
            self.nside = 2 ** max(0, int(np.round(np.log2(nside_estimate))))
        else:
            # Power-of-2 test via the n & (n - 1) bit trick.
            if nside < 1 or (nside & (nside - 1)) != 0:
                raise ValueError(f"nside must be a power of 2, got {nside}")
            self.nside = nside

        # Import healpy lazily so the rest of the package imports without it.
        try:
            import healpy as hp

            self.hp = hp
        except ImportError as exc:
            raise ImportError(
                "healpy is required for HEALPix grid. Install with: pip install healpy"
            ) from exc

        pixel_size_arcmin = self.hp.nside2resol(self.nside, arcmin=True)
        self.actual_angular_resolution = pixel_size_arcmin / 60.0

        self._logger.info(
            f"HEALPix: nside={self.nside}, "
            f"requested_res={angular_resolution:.2f}°, "
            f"actual_res={self.actual_angular_resolution:.2f}°"
        )

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"healpix"``

        """
        return GridType.HEALPIX.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Build HEALPix grid for the northern hemisphere.

        Queries all ``12 × nside²`` pixel centres in a single vectorised
        ``pix2ang`` call, retains those with ``theta ≤ π/2 − cutoff_theta``,
        and constructs approximate bounding boxes from the pixel resolution.

        Returns
        -------
        grid : pl.DataFrame
            One row per pixel.  Contains phi, theta (centre), approximate
            bounding-box limits, ``healpix_ipix`` (RING-ordered pixel index),
            and ``healpix_nside``.
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing the valid pixel indices.

        Raises
        ------
        ValueError
            If no pixel survives the hemisphere/elevation filter.

        """
        npix = self.hp.nside2npix(self.nside)

        # Vectorised centre lookup for every pixel at once (RING ordering);
        # healpy.pix2ang accepts an array of pixel indices.
        ipix_all = np.arange(npix)
        theta_all, phi_all = self.hp.pix2ang(self.nside, ipix_all)

        # Keep only the northern hemisphere above the elevation mask.
        keep = theta_all <= (np.pi / 2 - self.cutoff_theta_rad)
        valid_pixels = ipix_all[keep]

        if valid_pixels.size == 0:
            raise ValueError("No valid HEALPix pixels found in hemisphere")

        theta = theta_all[keep]
        phi = phi_all[keep]

        # Loop-invariant: the approximate pixel radius depends only on nside,
        # so compute it once instead of once per pixel.
        half_radius = self.hp.nside2resol(self.nside) / 2

        grid = pl.DataFrame(
            {
                "phi": phi,
                "theta": theta,
                # Approximate bounding boxes clamped to the hemisphere domain;
                # true HEALPix boundaries are curvilinear (see class docstring).
                "phi_min": np.maximum(0, phi - half_radius),
                "phi_max": np.minimum(2 * np.pi, phi + half_radius),
                "theta_min": np.maximum(0, theta - half_radius),
                "theta_max": np.minimum(np.pi / 2, theta + half_radius),
                "healpix_ipix": valid_pixels,
                "healpix_nside": np.full_like(valid_pixels, self.nside),
            }
        ).with_columns(
            [
                pl.col("healpix_ipix").cast(pl.Int64),
                pl.col("healpix_nside").cast(pl.Int64),
            ]
        )

        n_theta_bands = grid["theta"].n_unique()

        # NOTE: These limits are SYNTHETIC and do NOT correspond to actual
        # HEALPix pixel boundaries. They exist only for interface
        # compatibility with ring-based grids. For spatial queries, use the
        # per-pixel theta_min/max and phi_min/max columns instead.
        theta_lims = np.linspace(0, np.pi / 2, min(n_theta_bands, 20))
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]

        cell_ids_list = [np.asarray(valid_pixels, dtype=np.int64)]

        return grid, theta_lims, phi_lims, cell_ids_list

    def get_healpix_info(self) -> dict:
        """Get HEALPix-specific information.

        Returns
        -------
        info : dict
            Keys: ``nside``, ``npix_total``, ``pixel_area_sr``,
            ``pixel_area_arcmin2``, ``resolution_arcmin``,
            ``resolution_deg``, ``max_pixel_radius_deg``.

        """
        return {
            "nside": self.nside,
            "npix_total": self.hp.nside2npix(self.nside),
            "pixel_area_sr": self.hp.nside2pixarea(self.nside),
            "pixel_area_arcmin2": (
                self.hp.nside2pixarea(self.nside, degrees=True) * 3600
            ),
            "resolution_arcmin": self.hp.nside2resol(self.nside, arcmin=True),
            "resolution_deg": self.actual_angular_resolution,
            "max_pixel_radius_deg": np.rad2deg(self.hp.max_pixrad(self.nside)),
        }

__init__(angular_resolution=2, cutoff_theta=0, nside=None, phi_rotation=0)

Initialize the HEALPix grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. nside : int | None, optional HEALPix nside parameter. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    nside: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Initialize the HEALPix grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    nside : int | None, optional
        HEALPix nside parameter.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    Raises
    ------
    ImportError
        If ``healpy`` is not installed.
    ValueError
        If ``nside`` is given and is not a power of 2.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)

    # Determine nside
    if nside is None:
        # Clamp to >= 1 so log2 stays defined even for very coarse
        # requested resolutions (angular_resolution > ~58.6°).
        nside_estimate = max(
            1, int(np.sqrt(3 / np.pi) * 60 / angular_resolution)
        )
        self.nside = 2 ** max(0, int(np.round(np.log2(nside_estimate))))
    else:
        # Power-of-2 test via the n & (n - 1) bit trick.
        if nside < 1 or (nside & (nside - 1)) != 0:
            raise ValueError(f"nside must be a power of 2, got {nside}")
        self.nside = nside

    # Import healpy lazily so the rest of the package imports without it.
    try:
        import healpy as hp

        self.hp = hp
    except ImportError as exc:
        raise ImportError(
            "healpy is required for HEALPix grid. Install with: pip install healpy"
        ) from exc

    pixel_size_arcmin = self.hp.nside2resol(self.nside, arcmin=True)
    self.actual_angular_resolution = pixel_size_arcmin / 60.0

    self._logger.info(
        f"HEALPix: nside={self.nside}, "
        f"requested_res={angular_resolution:.2f}°, "
        f"actual_res={self.actual_angular_resolution:.2f}°"
    )

get_grid_type()

Return the grid-type identifier string.

Returns

str "healpix"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
147
148
149
150
151
152
153
154
155
156
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        The literal ``"healpix"``.

    """
    grid_type = GridType.HEALPIX
    return grid_type.value

get_healpix_info()

Get HEALPix-specific information.

Returns

info : dict Keys: nside, npix_total, pixel_area_sr, pixel_area_arcmin2, resolution_arcmin, resolution_deg, max_pixel_radius_deg.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/healpix_grid.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
def get_healpix_info(self) -> dict:
    """Summarize the HEALPix resolution parameters of this builder.

    Returns
    -------
    dict
        Mapping with keys ``nside``, ``npix_total``, ``pixel_area_sr``,
        ``pixel_area_arcmin2``, ``resolution_arcmin``, ``resolution_deg``,
        and ``max_pixel_radius_deg``.

    """
    hp = self.hp
    nside = self.nside
    # Convert the exact per-pixel solid angle (deg²) to arcmin².
    pixel_area_arcmin2 = hp.nside2pixarea(nside, degrees=True) * 3600
    info = {
        "nside": nside,
        "npix_total": hp.nside2npix(nside),
        "pixel_area_sr": hp.nside2pixarea(nside),
        "pixel_area_arcmin2": pixel_area_arcmin2,
        "resolution_arcmin": hp.nside2resol(nside, arcmin=True),
        "resolution_deg": self.actual_angular_resolution,
        "max_pixel_radius_deg": np.rad2deg(hp.max_pixrad(nside)),
    }
    return info

Geodesic Grid

Geodesic grid implementation.

GeodesicBuilder

Bases: BaseGridBuilder

Geodesic grid based on a subdivided icosahedron.

The sphere is tessellated into triangular cells by starting with an icosahedron (20 equilateral triangles) and recursively subdividing each triangle into four smaller triangles. All vertices are projected back onto the unit sphere after each subdivision step, so the final cells are spherical triangles. The grid has no polar singularity and provides near-uniform cell areas, though strict equal-area is not guaranteed — cell areas vary by a few percent depending on how they inherit the icosahedral symmetry axes.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

Cell centres are computed as the 3D Cartesian mean of the three vertices, re-normalised onto the unit sphere.

What angular_resolution means

angular_resolution is not used directly as a cell size. Instead it is used only when subdivision_level is not explicitly supplied, to estimate an appropriate subdivision level. The heuristic targets an approximate triangle edge length of 2 × angular_resolution::

target_edge ≈ 2 × angular_resolution   (degrees)
subdivision_level = ceil(log₂(63.4 / target_edge))

The number 63.4° is the edge length of a regular icosahedron inscribed in a unit sphere. Each subdivision halves the edge length, so the actual edge length at level n is approximately::

edge ≈ 63.4° / 2ⁿ   (degrees)

The total number of triangles on the full sphere is 20 × 4ⁿ. Roughly half fall in the northern hemisphere (exact count depends on the hemisphere boundary).

Mathematical construction

  1. Icosahedron – 12 vertices placed at the intersections of three mutually perpendicular golden-ratio rectangles, normalised to the unit sphere. 20 triangular faces connect them.
  2. Subdivision – each triangle is split into 4 by inserting edge midpoints. Each midpoint is projected onto the unit sphere (re-normalised) before the next subdivision. This is repeated subdivision_level times.
  3. Hemisphere filter – faces are kept if any of their three vertices satisfies theta ≤ π/2 − cutoff_theta. Consequently, boundary triangles that straddle the horizon are included and extend slightly below it.
  4. Phi wrapping – for triangles that straddle the 0/2π azimuthal boundary, vertex phis below π are shifted by +2π before computing bounding-box limits, then wrapped back.

Parameters

angular_resolution : float Approximate angular resolution in degrees. Used only to derive subdivision_level when that parameter is not given explicitly. cutoff_theta : float Elevation mask angle in degrees. Triangles are excluded only if all their vertices are below this elevation. subdivision_level : int or None Number of icosahedral subdivisions. If None, estimated from angular_resolution. Typical range 0–5. phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Notes

The theta_lims, phi_lims, and cell_ids fields of the returned GridData are synthetic evenly-spaced arrays kept only for interface compatibility with ring-based grids. They do not describe the actual triangular cell layout. Use the geodesic_vertices column and the vertices array in GridData.vertices for the true geometry.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
class GeodesicBuilder(BaseGridBuilder):
    """Geodesic grid based on a subdivided icosahedron.

    The sphere is tessellated into triangular cells by starting with an
    icosahedron (20 equilateral triangles) and recursively subdividing each
    triangle into four smaller triangles.  All vertices are projected back
    onto the unit sphere after each subdivision step, so the final cells are
    *spherical* triangles.  The grid has no polar singularity and provides
    near-uniform cell areas, though strict equal-area is *not* guaranteed —
    cell areas vary by a few percent depending on how they inherit the
    icosahedral symmetry axes.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    Cell centres are computed as the 3D Cartesian mean of the three vertices,
    re-normalised onto the unit sphere.

    What ``angular_resolution`` means
    ----------------------------------
    ``angular_resolution`` is **not** used directly as a cell size.  Instead it
    is used only when ``subdivision_level`` is *not* explicitly supplied, to
    *estimate* an appropriate subdivision level.  The heuristic targets an
    approximate triangle edge length of ``2 × angular_resolution``::

        target_edge ≈ 2 × angular_resolution   (degrees)
        subdivision_level = ceil(log₂(63.4 / target_edge))

    The number 63.4° is the edge length of a regular icosahedron inscribed in
    a unit sphere.  Each subdivision halves the edge length, so the actual
    edge length at level *n* is approximately::

        edge ≈ 63.4° / 2ⁿ   (degrees)

    The total number of triangles on the **full sphere** is ``20 × 4ⁿ``.
    Roughly half fall in the northern hemisphere (exact count depends on
    the hemisphere boundary).

    Mathematical construction
    -------------------------
    1. **Icosahedron** – 12 vertices placed at the intersections of three
       mutually perpendicular golden-ratio rectangles, normalised to the
       unit sphere.  20 triangular faces connect them.
    2. **Subdivision** – each triangle is split into 4 by inserting edge
       midpoints.  Each midpoint is projected onto the unit sphere
       (re-normalised) before the next subdivision.  This is repeated
       ``subdivision_level`` times.
    3. **Hemisphere filter** – faces are kept if *any* of their three
       vertices satisfies ``theta ≤ π/2 − cutoff_theta``.  Consequently,
       boundary triangles that straddle the horizon *are* included and
       extend slightly below it.
    4. **Phi wrapping** – for triangles that straddle the 0/2π azimuthal
       boundary, vertex phis below π are shifted by +2π before computing
       bounding-box limits, then wrapped back.

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to derive
        ``subdivision_level`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Triangles are excluded only if
        *all* their vertices are below this elevation.
    subdivision_level : int or None
        Number of icosahedral subdivisions.  If ``None``, estimated from
        ``angular_resolution``.  Typical range 0–5.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Notes
    -----
    The ``theta_lims``, ``phi_lims``, and ``cell_ids`` fields of the returned
    ``GridData`` are *synthetic* evenly-spaced arrays kept only for interface
    compatibility with ring-based grids.  They do **not** describe the actual
    triangular cell layout.  Use the ``geodesic_vertices`` column and the
    ``vertices`` array in ``GridData.vertices`` for the true geometry.

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        subdivision_level: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the geodesic grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        subdivision_level : int | None, optional
            Subdivision level override.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)
        self._triangles: np.ndarray | None = None

        if subdivision_level is not None:
            self.subdivision_level = subdivision_level
        else:
            # Target a triangle edge of ~2× the requested resolution; each
            # subdivision halves the icosahedron's 63.4° base edge length.
            target_edge_deg = 2 * angular_resolution
            estimated_level = int(np.ceil(np.log2(63.4 / target_edge_deg)))
            self.subdivision_level = max(0, estimated_level)

        self._logger.info(
            f"Geodesic: subdivision_level={self.subdivision_level}, "
            f"~{20 * 4**self.subdivision_level} triangles"
        )

    def get_triangles(self) -> np.ndarray | None:
        """Expose the triangle geometry computed during grid construction.

        Returns
        -------
        np.ndarray or None
            Array of shape ``(n_faces, 3, 3)`` giving the three 3D
            unit-sphere vertices of each triangle, or ``None`` when the
            grid has not been built yet.

        """
        return self._triangles

    def get_grid_type(self) -> str:
        """Identify this builder's grid type.

        Returns
        -------
        str
            The literal ``"geodesic"``.

        """
        grid_type = GridType.GEODESIC
        return grid_type.value

    def _extract_triangle_vertices(
        self, vertices: np.ndarray, faces: np.ndarray
    ) -> np.ndarray:
        """Gather per-face vertex coordinates.

        Parameters
        ----------
        vertices : np.ndarray
            All sphere vertices, shape ``(n_vertices, 3)``.
        faces : np.ndarray
            Face index array, shape ``(n_faces, 3)``.

        Returns
        -------
        triangles : np.ndarray
            Shape ``(n_faces, 3, 3)`` – three 3D vertices per face.

        """
        # Integer-array gather along axis 0; equivalent to vertices[faces]
        # but done in a single C-level pass.
        return np.take(vertices, faces, axis=0)

    def _build_grid(
        self,
    ) -> tuple[
        pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray], dict[str, Any]
    ]:
        """Build geodesic grid from subdivided icosahedron.

        Returns
        -------
        grid : pl.DataFrame
            One row per triangular cell.  Columns include phi, theta (centre),
            bounding-box limits, ``geodesic_vertices`` (3 vertex indices into
            the ``vertices`` array), and ``geodesic_subdivision``.
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing all cell ids.
        extra_kwargs : dict
            Contains ``vertices`` (shape ``(n_vertices, 3)``),
            ``vertex_phi``, and ``vertex_theta`` arrays for the full
            subdivided icosahedron.

        Raises
        ------
        ValueError
            If no face has a vertex above the elevation mask.

        """
        vertices, faces = self._create_icosahedron()

        # Subdivide
        for _ in range(self.subdivision_level):
            vertices, faces = self._subdivide_mesh(vertices, faces)

        # Project to unit sphere
        vertices = vertices / np.linalg.norm(vertices, axis=1, keepdims=True)

        # Convert to spherical (colatitude theta, azimuth phi in [0, 2π))
        x, y, z = vertices[:, 0], vertices[:, 1], vertices[:, 2]
        theta = np.arccos(np.clip(z, -1, 1))
        phi = np.arctan2(y, x)
        phi = np.mod(phi, 2 * np.pi)

        # Filter to northern hemisphere (above the elevation mask)
        hemisphere_mask = theta <= (np.pi / 2 - self.cutoff_theta_rad)

        # Vectorized face filter: keep every face with at least one vertex
        # above the mask (replaces the per-face Python loop).
        faces = np.asarray(faces)
        valid_faces = faces[hemisphere_mask[faces].any(axis=1)]

        if len(valid_faces) == 0:
            raise ValueError("No valid faces in hemisphere")

        # Create cells
        cells = []
        for face in valid_faces:
            v_indices = face
            face_phi = phi[v_indices]
            face_theta = theta[v_indices]

            # Handle phi wrapping for triangles crossing 0/2π boundary
            phi_range = np.ptp(face_phi)
            if phi_range > np.pi:
                # Triangle crosses the wraparound - unwrap relative to median
                ref_phi = np.median(face_phi)
                face_phi_unwrapped = face_phi.copy()
                # Unwrap angles that are > π away from reference
                mask_low = (ref_phi - face_phi_unwrapped) > np.pi
                mask_high = (face_phi_unwrapped - ref_phi) > np.pi
                face_phi_unwrapped[mask_low] += 2 * np.pi
                face_phi_unwrapped[mask_high] -= 2 * np.pi
                phi_min = float(np.min(face_phi_unwrapped) % (2 * np.pi))
                phi_max = float(np.max(face_phi_unwrapped) % (2 * np.pi))
            else:
                phi_min = float(np.min(face_phi))
                phi_max = float(np.max(face_phi))

            # Cell center - 3D Cartesian mean, re-normalised to the sphere
            face_vertices_3d = vertices[v_indices]
            center_3d = np.mean(face_vertices_3d, axis=0)
            center_3d = center_3d / np.linalg.norm(center_3d)

            center_theta = np.arccos(np.clip(center_3d[2], -1, 1))
            center_phi = np.arctan2(center_3d[1], center_3d[0])
            center_phi = np.mod(center_phi, 2 * np.pi)

            # Cell bounds (theta from vertices, phi already computed above)
            theta_min = float(np.min(face_theta))
            theta_max = float(np.max(face_theta))

            cells.append(
                {
                    "phi": center_phi,
                    "theta": center_theta,
                    "phi_min": phi_min,
                    "phi_max": phi_max,
                    "theta_min": theta_min,
                    "theta_max": theta_max,
                    "geodesic_vertices": v_indices.tolist(),
                    "geodesic_subdivision": self.subdivision_level,
                }
            )

        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        extra_kwargs: dict[str, Any] = {
            "vertices": vertices,
            "vertex_phi": phi,
            "vertex_theta": theta,
        }

        # NOTE: Synthetic limits for interface compatibility only; they do
        # not describe the triangular layout (see class docstring).
        theta_lims = np.linspace(0, np.pi / 2, 10)
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]
        cell_ids_list = [np.arange(grid.height)]

        # NOTE(review): _triangles is built from ALL faces, not just the
        # hemisphere-filtered valid_faces — presumably intentional for
        # full-mesh visualization; confirm against get_triangles() callers.
        self._triangles = self._extract_triangle_vertices(vertices, faces)

        return grid, theta_lims, phi_lims, cell_ids_list, extra_kwargs

    def _create_icosahedron(self) -> tuple[np.ndarray, np.ndarray]:
        """Create a unit-sphere icosahedron.

        Returns
        -------
        vertices : np.ndarray
            Shape ``(12, 3)`` – vertices on the unit sphere.
        faces : np.ndarray
            Shape ``(20, 3)`` – integer vertex indices per triangular face.

        """
        phi_golden = (1 + np.sqrt(5)) / 2

        vertices = np.array(
            [
                [-1, phi_golden, 0],
                [1, phi_golden, 0],
                [-1, -phi_golden, 0],
                [1, -phi_golden, 0],
                [0, -1, phi_golden],
                [0, 1, phi_golden],
                [0, -1, -phi_golden],
                [0, 1, -phi_golden],
                [phi_golden, 0, -1],
                [phi_golden, 0, 1],
                [-phi_golden, 0, -1],
                [-phi_golden, 0, 1],
            ],
            dtype=np.float64,
        )

        vertices = vertices / np.linalg.norm(vertices, axis=1, keepdims=True)

        faces = np.array(
            [
                [0, 11, 5],
                [0, 5, 1],
                [0, 1, 7],
                [0, 7, 10],
                [0, 10, 11],
                [1, 5, 9],
                [5, 11, 4],
                [11, 10, 2],
                [10, 7, 6],
                [7, 1, 8],
                [3, 9, 4],
                [3, 4, 2],
                [3, 2, 6],
                [3, 6, 8],
                [3, 8, 9],
                [4, 9, 5],
                [2, 4, 11],
                [6, 2, 10],
                [8, 6, 7],
                [9, 8, 1],
            ],
            dtype=np.int64,
        )

        return vertices, faces

    def _subdivide_mesh(
        self, vertices: np.ndarray, faces: np.ndarray
    ) -> tuple[np.ndarray, np.ndarray]:
        """Subdivide each triangle into 4 smaller triangles.

        Each edge midpoint is computed, normalised onto the unit sphere, and
        cached so that shared edges produce only one new vertex.

        Parameters
        ----------
        vertices : np.ndarray
            Current vertex array, shape ``(n_vertices, 3)``.
        faces : np.ndarray
            Current face array, shape ``(n_faces, 3)``.

        Returns
        -------
        new_vertices : np.ndarray
            Expanded vertex array, shape ``(n_vertices + n_new_midpoints, 3)``.
        new_faces : np.ndarray
            New face array, shape ``(4 × n_faces, 3)``.

        """
        new_faces = []
        edge_midpoints: dict[tuple[int, int], int] = {}

        def get_midpoint(v1: int, v2: int) -> int:
            """Return midpoint vertex index for an edge.

            Parameters
            ----------
            v1 : int
                First vertex index.
            v2 : int
                Second vertex index.

            Returns
            -------
            int
                Index of the midpoint vertex.

            """
            edge = tuple(sorted([v1, v2]))
            if edge not in edge_midpoints:
                edge_midpoints[edge] = len(vertices) + len(edge_midpoints)
            return edge_midpoints[edge]

        for face in faces:
            v0, v1, v2 = face

            m01 = get_midpoint(v0, v1)
            m12 = get_midpoint(v1, v2)
            m20 = get_midpoint(v2, v0)

            new_faces.extend(
                [
                    [v0, m01, m20],
                    [v1, m12, m01],
                    [v2, m20, m12],
                    [m01, m12, m20],
                ]
            )

        n_original = len(vertices)
        n_new = len(edge_midpoints)
        final_vertices = np.zeros((n_original + n_new, 3))
        final_vertices[:n_original] = vertices

        for edge, idx in edge_midpoints.items():
            v1, v2 = edge
            midpoint = (vertices[v1] + vertices[v2]) / 2
            midpoint = midpoint / np.linalg.norm(midpoint)
            final_vertices[idx] = midpoint

        return final_vertices, np.array(new_faces)

__init__(angular_resolution=2, cutoff_theta=0, subdivision_level=None, phi_rotation=0)

Initialize the geodesic grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. subdivision_level : int | None, optional Subdivision level override. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    subdivision_level: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Initialize the geodesic grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    subdivision_level : int | None, optional
        Explicit subdivision depth; estimated from ``angular_resolution``
        when omitted.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)
    self._triangles: np.ndarray | None = None

    if subdivision_level is not None:
        self.subdivision_level = subdivision_level
    else:
        # Target edge ≈ 2 × resolution; 63.4° ≈ the base icosahedron edge arc.
        target_edge_deg = angular_resolution * 2
        level = int(np.ceil(np.log2(63.4 / target_edge_deg)))
        self.subdivision_level = max(0, level)

    self._logger.info(
        f"Geodesic: subdivision_level={self.subdivision_level}, "
        f"~{20 * 4**self.subdivision_level} triangles"
    )

get_triangles()

Return triangle vertex coordinates for visualization.

Returns

triangles : np.ndarray or None Array of shape (n_faces, 3, 3) where triangles[i] contains the three 3D unit-sphere vertices of triangle i. None if the grid has not been built yet.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
131
132
133
134
135
136
137
138
139
140
141
142
def get_triangles(self) -> np.ndarray | None:
    """Return triangle vertex coordinates for visualization.

    Returns
    -------
    triangles : np.ndarray or None
        Array of shape ``(n_faces, 3, 3)`` where ``triangles[i]`` contains
        the three 3D unit-sphere vertices of triangle *i*.  ``None`` if
        the grid has not been built yet.

    """
    return self._triangles

get_grid_type()

Return the grid-type identifier string.

Returns

str "geodesic"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/geodesic_grid.py
144
145
146
147
148
149
150
151
152
153
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        ``"geodesic"``

    """
    # Delegate to the shared enum so the identifier lives in one place.
    return GridType.GEODESIC.value

Fibonacci Grid

Fibonacci sphere grid implementation.

FibonacciBuilder

Bases: BaseGridBuilder

Fibonacci sphere grid with spherical Voronoi tessellation.

Points are distributed on the sphere using the Fibonacci lattice (golden-spiral method), which provides one of the most uniform point distributions achievable on a sphere without iterative optimisation. Each point then becomes the centre of a spherical Voronoi cell — the region of the sphere closer to that point than to any other. The resulting tessellation has no polar singularities and near-uniform cell areas.

The tessellation is computed by scipy.spatial.SphericalVoronoi. Because Voronoi cells have curvilinear boundaries, the phi_min/max and theta_min/max columns in the grid are axis-aligned bounding boxes, not the true cell boundaries. They are unreliable for spatial queries — use the voronoi_region column (vertex indices into the SphericalVoronoi.vertices array) for exact geometry.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

What n_points (resolution) means

Resolution is controlled by n_points, the number of Voronoi cells in the hemisphere. When n_points is not supplied it is estimated from angular_resolution via::

cell_area  ≈ angular_resolution²   (radians²)
n_points   = max(10, int(2π / cell_area))

The approximate cell "diameter" (assuming a circular cell of equal area) is::

d ≈ 2 √(2π / n_points)   (radians)
  ≈ 2 × angular_resolution

angular_resolution therefore has no direct geometric meaning for this grid type — it is only a convenience for the n_points estimator.

Mathematical construction

  1. Full-sphere Fibonacci lattice – 2 × n_points points are generated on the unit sphere. Point i has::

    θᵢ = arccos(1 − 2(i + 0.5) / N)
    φᵢ = 2π (i + 0.5) / φ_golden   (mod 2π)

where N = 2 × n_points and φ_golden = (1+√5)/2. The +0.5 offset avoids placing points exactly at the poles. 2. Hemisphere filter – points with θ > π/2 − cutoff_theta are discarded. 3. Spherical Voronoi tessellationscipy.spatial.SphericalVoronoi computes the Voronoi diagram on the unit sphere. Regions are sorted so that vertices appear in counter-clockwise order around each cell. 4. Bounding boxes – axis-aligned bounding boxes in (phi, theta) are computed from the Voronoi vertex coordinates. These are approximations only (see caveat above).

Parameters

angular_resolution : float Approximate angular resolution in degrees. Used only to estimate n_points when that parameter is not given explicitly. cutoff_theta : float Elevation mask angle in degrees. Points below this elevation are excluded before tessellation. n_points : int or None Target number of Voronoi cells in the hemisphere. If None, estimated from angular_resolution. phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Raises

ImportError If scipy is not installed. ValueError If fewer than 4 points survive the hemisphere filter.

Notes

The theta_lims, phi_lims, and cell_ids fields of the returned GridData are synthetic evenly-spaced arrays kept only for interface compatibility with ring-based grids. They do not describe the actual Voronoi cell layout.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/fibonacci_grid.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
class FibonacciBuilder(BaseGridBuilder):
    """Fibonacci sphere grid with spherical Voronoi tessellation.

    Points are distributed on the sphere using the *Fibonacci lattice*
    (golden-spiral method), which provides one of the most uniform
    point distributions achievable on a sphere without iterative
    optimisation.  Each point then becomes the centre of a *spherical
    Voronoi cell* — the region of the sphere closer to that point than
    to any other.  The resulting tessellation has no polar singularities
    and near-uniform cell areas.

    The tessellation is computed by ``scipy.spatial.SphericalVoronoi``.
    Because Voronoi cells have curvilinear boundaries, the ``phi_min/max``
    and ``theta_min/max`` columns in the grid are axis-aligned *bounding
    boxes*, **not** the true cell boundaries.  They are unreliable for
    spatial queries — use the ``voronoi_region`` column (vertex indices
    into the ``SphericalVoronoi.vertices`` array) for exact geometry.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    What ``n_points`` (resolution) means
    -------------------------------------
    Resolution is controlled by ``n_points``, the number of Voronoi cells
    in the hemisphere.  When ``n_points`` is not supplied it is estimated
    from ``angular_resolution`` via::

        cell_area  ≈ angular_resolution²   (radians²)
        n_points   = max(10, int(2π / cell_area))

    The approximate cell "diameter" (assuming a circular cell of equal area)
    is::

        d ≈ 2 √(2π / n_points)   (radians)
          ≈ 2 × angular_resolution

    ``angular_resolution`` therefore has **no direct geometric meaning** for
    this grid type — it is only a convenience for the ``n_points`` estimator.

    Mathematical construction
    -------------------------
    1. **Full-sphere Fibonacci lattice** – ``2 × n_points`` points are
       generated on the unit sphere.  Point *i* has::

           θᵢ = arccos(1 − 2(i + 0.5) / N)
           φᵢ = 2π (i + 0.5) / φ_golden   (mod 2π)

       where ``N = 2 × n_points`` and ``φ_golden = (1+√5)/2``.  The
       ``+0.5`` offset avoids placing points exactly at the poles.
    2. **Hemisphere filter** – points with ``θ > π/2 − cutoff_theta``
       are discarded.
    3. **Spherical Voronoi tessellation** –
       ``scipy.spatial.SphericalVoronoi`` computes the Voronoi diagram
       on the unit sphere.  Regions are sorted so that vertices appear
       in counter-clockwise order around each cell.
    4. **Bounding boxes** – axis-aligned bounding boxes in (phi, theta)
       are computed from the Voronoi vertex coordinates.  These are
       approximations only (see caveat above).

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to estimate
        ``n_points`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Points below this elevation are
        excluded before tessellation.
    n_points : int or None
        Target number of Voronoi cells in the hemisphere.  If ``None``,
        estimated from ``angular_resolution``.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Raises
    ------
    ImportError
        If ``scipy`` is not installed.
    ValueError
        If fewer than 4 points survive the hemisphere filter.

    Notes
    -----
    The ``theta_lims``, ``phi_lims``, and ``cell_ids`` fields of the returned
    ``GridData`` are *synthetic* evenly-spaced arrays kept only for interface
    compatibility with ring-based grids.  They do **not** describe the actual
    Voronoi cell layout.

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        n_points: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the Fibonacci grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        n_points : int | None, optional
            Number of points to generate.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)

        if n_points is None:
            # One cell per patch of ~angular_resolution² (radians²) on the
            # 2π-steradian hemisphere, floored, with a minimum of 10 cells.
            cell_area = self.angular_resolution_rad**2
            hemisphere_area = 2 * np.pi
            self.n_points = max(10, int(hemisphere_area / cell_area))
        else:
            self.n_points = n_points

        self._logger.info(f"Fibonacci: generating {self.n_points} points")

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"fibonacci"``

        """
        return GridType.FIBONACCI.value

    def _build_grid(
        self,
    ) -> tuple[
        pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray], dict[str, Any]
    ]:
        """Build Fibonacci sphere grid with Voronoi tessellation.

        Returns
        -------
        grid : pl.DataFrame
            One row per Voronoi cell.  Contains phi, theta (centre),
            bounding-box limits, ``voronoi_region`` (list of vertex indices
            into the Voronoi vertex array), and ``n_vertices``.
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing all cell ids.
        extra_kwargs : dict
            Contains ``voronoi`` (the ``SphericalVoronoi`` object) and
            ``points_xyz`` (the hemisphere point cloud, shape
            ``(n_points, 3)``).

        Raises
        ------
        ImportError
            If ``scipy`` is not installed.
        ValueError
            If fewer than 4 points survive the hemisphere filter.

        """
        # Generate on the full sphere so that roughly n_points survive the
        # hemisphere cut below.
        points_xyz = self._generate_fibonacci_sphere(self.n_points * 2)

        # Convert to spherical
        x, y, z = points_xyz[:, 0], points_xyz[:, 1], points_xyz[:, 2]
        theta = np.arccos(np.clip(z, -1, 1))
        phi = np.arctan2(y, x)
        phi = np.mod(phi, 2 * np.pi)

        # Filter to northern hemisphere
        mask = (theta <= (np.pi / 2 - self.cutoff_theta_rad)) & (theta >= 0)

        phi = phi[mask]
        theta = theta[mask]
        points_xyz = points_xyz[mask]

        if len(points_xyz) < 4:
            raise ValueError("Not enough points in hemisphere for Voronoi tessellation")

        # Guard only the import, so a genuine failure inside the Voronoi
        # computation is never misreported as a missing dependency, and chain
        # the original error for diagnosis.
        try:
            from scipy.spatial import SphericalVoronoi
        except ImportError as err:
            raise ImportError(
                "scipy required for Fibonacci grid. Install: pip install scipy"
            ) from err

        # Compute spherical Voronoi tessellation
        sv = SphericalVoronoi(points_xyz, radius=1, threshold=1e-10)
        sv.sort_vertices_of_regions()

        # Create cells; phi/theta/points_xyz were filtered by the same mask,
        # so the arrays are guaranteed equal-length.
        cells = []
        for point_idx, (p_phi, p_theta) in enumerate(zip(phi, theta, strict=True)):
            region_vertices = sv.regions[point_idx]

            # Skip open regions (should not occur on the sphere, but be safe).
            if -1 in region_vertices:
                continue

            region_coords = sv.vertices[region_vertices]

            # Convert region vertices to spherical
            rv_x, rv_y, rv_z = (
                region_coords[:, 0],
                region_coords[:, 1],
                region_coords[:, 2],
            )
            rv_theta = np.arccos(np.clip(rv_z, -1, 1))
            rv_phi = np.arctan2(rv_y, rv_x)
            rv_phi = np.mod(rv_phi, 2 * np.pi)

            # Bounding boxes only — see class docstring caveat.  Cast to
            # Python floats for consistency with the geodesic builder.
            cells.append(
                {
                    "phi": p_phi,
                    "theta": p_theta,
                    "phi_min": float(np.min(rv_phi)),
                    "phi_max": float(np.max(rv_phi)),
                    "theta_min": float(np.min(rv_theta)),
                    "theta_max": float(np.max(rv_theta)),
                    "voronoi_region": (
                        region_vertices
                        if isinstance(region_vertices, list)
                        else region_vertices.tolist()
                    ),
                    "n_vertices": len(region_vertices),
                }
            )

        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        extra_kwargs: dict[str, Any] = {
            "voronoi": sv,
            "points_xyz": points_xyz,
        }

        # Synthetic limits kept only for interface compatibility (see Notes).
        theta_lims = np.linspace(0, np.pi / 2, 10)
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]
        cell_ids_list = [np.arange(grid.height)]

        return grid, theta_lims, phi_lims, cell_ids_list, extra_kwargs

    def _generate_fibonacci_sphere(self, n: int) -> np.ndarray:
        """Generate points on the unit sphere using the golden-spiral lattice.

        Parameters
        ----------
        n : int
            Total number of points on the full sphere.

        Returns
        -------
        points : np.ndarray
            Shape ``(n, 3)`` – Cartesian (x, y, z) coordinates on the unit
            sphere.

        """
        golden_ratio = (1 + np.sqrt(5)) / 2

        # Half-integer offset keeps points off the poles.
        indices = np.arange(0, n, dtype=np.float64) + 0.5

        # Polar angle
        theta = np.arccos(1 - 2 * indices / n)

        # Azimuthal angle
        phi = 2 * np.pi * indices / golden_ratio
        phi = np.mod(phi, 2 * np.pi)

        # Convert to Cartesian
        x = np.sin(theta) * np.cos(phi)
        y = np.sin(theta) * np.sin(phi)
        z = np.cos(theta)

        return np.column_stack([x, y, z])

__init__(angular_resolution=2, cutoff_theta=0, n_points=None, phi_rotation=0)

Initialize the Fibonacci grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. n_points : int | None, optional Number of points to generate. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/fibonacci_grid.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    n_points: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Initialize the Fibonacci grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees; used only to estimate ``n_points``.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    n_points : int | None, optional
        Number of points to generate; estimated from
        ``angular_resolution`` when omitted.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)

    if n_points is not None:
        self.n_points = n_points
    else:
        # One cell per ~angular_resolution² (radians²) patch of the
        # 2π hemisphere, floored, with a minimum of 10 cells.
        approx_cell_area = self.angular_resolution_rad**2
        estimate = int((2 * np.pi) / approx_cell_area)
        self.n_points = max(10, estimate)

    self._logger.info(f"Fibonacci: generating {self.n_points} points")

get_grid_type()

Return the grid-type identifier string.

Returns

str "fibonacci"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/fibonacci_grid.py
135
136
137
138
139
140
141
142
143
144
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        ``"fibonacci"``

    """
    # Single source of truth for the identifier string.
    return GridType.FIBONACCI.value

HTM Grid

HTM (Hierarchical Triangular Mesh) grid implementation.

HTMBuilder

Bases: BaseGridBuilder

Hierarchical Triangular Mesh (HTM) grid.

HTM divides the sphere into an octahedron (8 triangular faces), then recursively subdivides each face into 4 smaller triangles by inserting edge-midpoint vertices projected onto the unit sphere. The recursion depth is controlled by htm_level. This produces a strictly hierarchical triangulation: every triangle at level n is the union of exactly 4 triangles at level n + 1.

Cell areas are approximately equal but not strictly so — area uniformity improves with level because the octahedral edge-length asymmetry averages out over many subdivisions.

Coordinate convention (physics / GNSS)

  • phi ∈ [0, 2π) – azimuthal angle from North, clockwise (navigation convention)
  • theta ∈ [0, π/2] – polar angle from zenith (0 = straight up, π/2 = horizon)

Cell centres are the 3D Cartesian mean of the three triangle vertices, re-normalised onto the unit sphere.

What htm_level (resolution) means

The resolution is set by htm_level, not by angular_resolution. angular_resolution is used only to estimate an appropriate level when htm_level is not supplied explicitly. The heuristic is::

target_edge ≈ 2 × angular_resolution   (degrees)
htm_level   = min(15, ceil(log₂(90 / target_edge)))

The approximate triangle edge length at level n is::

edge ≈ 90° / 2ⁿ
| Level | Triangles (full sphere) | Approx edge |
|-------|-------------------------|-------------|
| 0     | 8                       | 90°         |
| 1     | 32                      | 45°         |
| 2     | 128                     | 22.5°       |
| 3     | 512                     | 11.25°      |
| 4     | 2 048                   | 5.6°        |
| n     | 8 × 4ⁿ                  | 90° / 2ⁿ    |

Mathematical construction

  1. Octahedron – 6 vertices at ±x, ±y, ±z on the unit sphere, forming 8 triangular faces (4 northern, 4 southern).
  2. Subdivision – for each triangle [v₀, v₁, v₂], three edge midpoints are computed and projected onto the unit sphere::

    m₀ = normalise((v₀ + v₁) / 2) m₁ = normalise((v₁ + v₂) / 2) m₂ = normalise((v₂ + v₀) / 2)

The four children are [v₀, m₀, m₂], [v₁, m₁, m₀], [v₂, m₂, m₁], and [m₀, m₁, m₂]. This is repeated htm_level times. 3. Hemisphere filter – a triangle is kept if any of its three vertices satisfies theta ≤ π/2 − cutoff_theta. Boundary triangles that straddle the horizon are therefore included and may extend slightly below it. 4. Each leaf triangle becomes one cell; its centre, bounding box, and three vertex coordinates are stored.

Parameters

angular_resolution : float Approximate angular resolution in degrees. Used only to derive htm_level when that parameter is not given explicitly. cutoff_theta : float Elevation mask angle in degrees. Triangles are excluded only when all their vertices are below this elevation. htm_level : int or None HTM subdivision depth. If None, estimated from angular_resolution. Practical range 0–15. phi_rotation : float Rigid azimuthal rotation applied after construction, in degrees.

Notes

The theta_lims, phi_lims, and cell_ids fields of the returned GridData are synthetic evenly-spaced arrays kept only for interface compatibility with ring-based grids. They do not describe the actual triangular cell layout.

HTM IDs in this implementation use a decimal-digit scheme (parent_id × 10 + child_index) which diverges from the original SDSS HTM binary-coded ID scheme. This is adequate for indexing but should not be compared with external HTM catalogues.

References

Kunszt et al. (2001): "The Hierarchical Triangular Mesh" https://www.sdss.org/dr12/algorithms/htm/

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
class HTMBuilder(BaseGridBuilder):
    """Hierarchical Triangular Mesh (HTM) grid.

    HTM divides the sphere into an octahedron (8 triangular faces), then
    recursively subdivides each face into 4 smaller triangles by inserting
    edge-midpoint vertices projected onto the unit sphere.  The recursion
    depth is controlled by ``htm_level``.  This produces a strictly
    hierarchical triangulation: every triangle at level *n* is the union of
    exactly 4 triangles at level *n* + 1.

    Cell areas are *approximately* equal but not strictly so — area
    uniformity improves with level because the octahedral edge-length
    asymmetry averages out over many subdivisions.

    Coordinate convention (physics / GNSS)
    ---------------------------------------
    * phi  ∈ [0, 2π)  – azimuthal angle from North, clockwise (navigation convention)
    * theta ∈ [0, π/2] – polar angle from zenith (0 = straight up,
      π/2 = horizon)

    Cell centres are the 3D Cartesian mean of the three triangle vertices,
    re-normalised onto the unit sphere.

    What ``htm_level`` (resolution) means
    --------------------------------------
    The resolution is set by ``htm_level``, **not** by ``angular_resolution``.
    ``angular_resolution`` is used only to *estimate* an appropriate level
    when ``htm_level`` is not supplied explicitly.  The heuristic is::

        target_edge ≈ 2 × angular_resolution   (degrees)
        htm_level   = min(15, ceil(log₂(90 / target_edge)))

    The approximate triangle edge length at level *n* is::

        edge ≈ 90° / 2ⁿ

    | Level | Triangles (full sphere) | Approx edge |
    |-------|-------------------------|-------------|
    | 0     | 8                       | 90°         |
    | 1     | 32                      | 45°         |
    | 2     | 128                     | 22.5°       |
    | 3     | 512                     | 11.25°      |
    | 4     | 2 048                   | 5.6°        |
    | n     | 8 × 4ⁿ                  | 90° / 2ⁿ   |

    Mathematical construction
    -------------------------
    1. **Octahedron** – 6 vertices at ±x, ±y, ±z on the unit sphere, forming
       8 triangular faces (4 northern, 4 southern).
    2. **Subdivision** – for each triangle [v₀, v₁, v₂], three edge
       midpoints are computed and projected onto the unit sphere::

           m₀ = normalise((v₀ + v₁) / 2)
           m₁ = normalise((v₁ + v₂) / 2)
           m₂ = normalise((v₂ + v₀) / 2)

       The four children are [v₀, m₀, m₂], [v₁, m₁, m₀], [v₂, m₂, m₁],
       and [m₀, m₁, m₂].  This is repeated ``htm_level`` times.
    3. **Hemisphere filter** – a triangle is kept if *any* of its three
       vertices satisfies ``theta ≤ π/2 − cutoff_theta``.  Boundary
       triangles that straddle the horizon are therefore included and may
       extend slightly below it.
    4. Each leaf triangle becomes one cell; its centre, bounding box, and
       three vertex coordinates are stored.

    Parameters
    ----------
    angular_resolution : float
        Approximate angular resolution in degrees.  Used only to derive
        ``htm_level`` when that parameter is not given explicitly.
    cutoff_theta : float
        Elevation mask angle in degrees.  Triangles are excluded only when
        *all* their vertices are below this elevation.
    htm_level : int or None
        HTM subdivision depth.  If ``None``, estimated from
        ``angular_resolution``.  Practical range 0–15.
    phi_rotation : float
        Rigid azimuthal rotation applied after construction, in degrees.

    Notes
    -----
    The ``theta_lims``, ``phi_lims``, and ``cell_ids`` fields of the returned
    ``GridData`` are *synthetic* evenly-spaced arrays kept only for interface
    compatibility with ring-based grids.  They do **not** describe the actual
    triangular cell layout.

    HTM IDs in this implementation use a decimal-digit scheme
    (``parent_id × 10 + child_index``) which diverges from the original
    SDSS HTM binary-coded ID scheme.  This is adequate for indexing but
    should not be compared with external HTM catalogues.

    References
    ----------
    Kunszt et al. (2001): "The Hierarchical Triangular Mesh"
    https://www.sdss.org/dr12/algorithms/htm/

    """

    def __init__(
        self,
        angular_resolution: float = 2,
        cutoff_theta: float = 0,
        htm_level: int | None = None,
        phi_rotation: float = 0,
    ) -> None:
        """Initialize the HTM grid builder.

        Parameters
        ----------
        angular_resolution : float, default 2
            Angular resolution in degrees.
        cutoff_theta : float, default 0
            Maximum polar angle cutoff in degrees.
        htm_level : int | None, optional
            HTM subdivision level.
        phi_rotation : float, default 0
            Rotation angle in degrees.

        """
        super().__init__(angular_resolution, cutoff_theta, phi_rotation)

        if htm_level is None:
            # Heuristic: edge length at level n is ~90°/2ⁿ; aim for edges
            # roughly twice the requested angular resolution.
            target_edge_deg = angular_resolution * 2
            self.htm_level = max(
                0,
                int(np.ceil(np.log2(90 / target_edge_deg))),
            )
            # Practical upper bound on subdivision depth (8 × 4¹⁵ triangles).
            self.htm_level = min(self.htm_level, 15)
        else:
            self.htm_level = htm_level

        self._logger.info(
            f"HTM: level={self.htm_level}, ~{8 * 4**self.htm_level} triangles"
        )

    def get_grid_type(self) -> str:
        """Return the grid-type identifier string.

        Returns
        -------
        str
            ``"htm"``

        """
        return GridType.HTM.value

    def _build_grid(
        self,
    ) -> tuple[pl.DataFrame, np.ndarray, list[np.ndarray], list[np.ndarray]]:
        """Build HTM grid by recursive octahedron subdivision.

        Returns
        -------
        grid : pl.DataFrame
            One row per triangular cell.  Contains phi, theta (centre),
            bounding-box limits, ``htm_id``, ``htm_level``, and the three
            vertex coordinate columns ``htm_vertex_0/1/2`` (each a list of
            3 floats in Cartesian xyz).
        theta_lims : np.ndarray
            Synthetic evenly-spaced theta limits (interface compatibility only).
        phi_lims : list[np.ndarray]
            Synthetic evenly-spaced phi limits (interface compatibility only).
        cell_ids : list[np.ndarray]
            Single-element list containing all cell ids.

        """
        # Octahedron: 6 unit vectors along ±x, ±y, ±z.
        base_vertices = np.array(
            [
                [0, 0, 1],  # 0: North pole
                [1, 0, 0],  # 1: +X
                [0, 1, 0],  # 2: +Y
                [-1, 0, 0],  # 3: -X
                [0, -1, 0],  # 4: -Y
                [0, 0, -1],  # 5: South pole
            ],
            dtype=np.float64,
        )

        base_faces = [
            [0, 1, 2],
            [0, 2, 3],
            [0, 3, 4],
            [0, 4, 1],  # Northern
            [5, 2, 1],
            [5, 3, 2],
            [5, 4, 3],
            [5, 1, 4],  # Southern
        ]

        all_triangles = []
        all_htm_ids = []

        # Subdivide every octahedron face down to the target level; the
        # face index seeds the HTM id of each subtree.
        for base_idx, base_face in enumerate(base_faces):
            v0 = base_vertices[base_face[0]]
            v1 = base_vertices[base_face[1]]
            v2 = base_vertices[base_face[2]]

            triangles, ids = self._subdivide_htm([v0, v1, v2], base_idx, self.htm_level)
            all_triangles.extend(triangles)
            all_htm_ids.extend(ids)

        # Convert to cells
        cells = []
        for tri, htm_id in zip(all_triangles, all_htm_ids):
            v0, v1, v2 = tri

            # Center: Cartesian mean re-projected onto the unit sphere.
            center = (v0 + v1 + v2) / 3
            center = center / np.linalg.norm(center)

            theta_center = np.arccos(np.clip(center[2], -1, 1))
            phi_center = np.arctan2(center[1], center[0])
            phi_center = np.mod(phi_center, 2 * np.pi)

            # Filter hemisphere: drop a triangle only if ALL vertices lie
            # below the elevation mask, so horizon-straddlers are kept.
            vertex_thetas = [np.arccos(np.clip(v[2], -1, 1)) for v in [v0, v1, v2]]
            if all(t > (np.pi / 2 - self.cutoff_theta_rad) for t in vertex_thetas):
                continue

            # Vertex coords
            thetas, phis = [], []
            for v in [v0, v1, v2]:
                t = np.arccos(np.clip(v[2], -1, 1))
                p = np.arctan2(v[1], v[0])
                p = np.mod(p, 2 * np.pi)
                thetas.append(t)
                phis.append(p)

            # NOTE: phi_min/phi_max form a naive bounding box and do not
            # account for cells wrapping across phi = 0.
            cells.append(
                {
                    "phi": phi_center,
                    "theta": theta_center,
                    "phi_min": min(phis),
                    "phi_max": max(phis),
                    "theta_min": min(thetas),
                    "theta_max": max(thetas),
                    "htm_id": htm_id,
                    "htm_level": self.htm_level,
                    "htm_vertex_0": v0.tolist(),
                    "htm_vertex_1": v1.tolist(),
                    "htm_vertex_2": v2.tolist(),
                }
            )

        grid = pl.DataFrame(cells).with_columns(
            pl.int_range(0, pl.len()).alias("cell_id")
        )

        # Synthetic limits kept only for interface compatibility with
        # ring-based grids; they do not describe the triangular layout.
        theta_lims = np.linspace(0, np.pi / 2, 10)
        phi_lims = [np.linspace(0, 2 * np.pi, 20) for _ in range(len(theta_lims))]
        cell_ids_list = [grid["cell_id"].to_numpy()]

        return grid, theta_lims, phi_lims, cell_ids_list

    def _subdivide_htm(
        self,
        tri: list,
        htm_id: int,
        target_level: int,
        current_level: int = 0,
    ) -> tuple[list, list]:
        """Recursively subdivide a single triangle.

        Parameters
        ----------
        tri : list of np.ndarray
            Three vertex arrays [v₀, v₁, v₂], each shape ``(3,)``.
        htm_id : int
            Current HTM identifier for this triangle.
        target_level : int
            Recursion depth to reach.
        current_level : int
            Current recursion depth.

        Returns
        -------
        triangles : list of list
            Leaf triangles at ``target_level``.
        ids : list of int
            Corresponding HTM identifiers.

        """
        if current_level == target_level:
            return [tri], [htm_id]

        v0, v1, v2 = tri

        # Midpoints on sphere
        m0 = (v0 + v1) / 2
        m0 = m0 / np.linalg.norm(m0)
        m1 = (v1 + v2) / 2
        m1 = m1 / np.linalg.norm(m1)
        m2 = (v2 + v0) / 2
        m2 = m2 / np.linalg.norm(m2)

        # 4 children
        children = [[v0, m0, m2], [v1, m1, m0], [v2, m2, m1], [m0, m1, m2]]

        all_tris = []
        all_ids = []

        for child_idx, child in enumerate(children):
            # Decimal-digit child encoding (differs from SDSS binary HTM ids).
            child_id = htm_id * 10 + child_idx
            tris, ids = self._subdivide_htm(
                child,
                child_id,
                target_level,
                current_level + 1,
            )
            all_tris.extend(tris)
            all_ids.extend(ids)

        return all_tris, all_ids

    def get_htm_info(self) -> dict:
        """Get HTM-specific information.

        Returns
        -------
        info : dict
            Keys: ``htm_level``, ``n_triangles_full_sphere``,
            ``approx_edge_length_deg``, ``approx_edge_length_arcmin``.

        """
        n_triangles = 8 * 4**self.htm_level
        approx_edge_deg = 90 / (2**self.htm_level)

        return {
            "htm_level": self.htm_level,
            "n_triangles_full_sphere": n_triangles,
            "approx_edge_length_deg": approx_edge_deg,
            "approx_edge_length_arcmin": approx_edge_deg * 60,
        }

__init__(angular_resolution=2, cutoff_theta=0, htm_level=None, phi_rotation=0)

Initialize the HTM grid builder.

Parameters

angular_resolution : float, default 2 Angular resolution in degrees. cutoff_theta : float, default 0 Maximum polar angle cutoff in degrees. htm_level : int | None, optional HTM subdivision level. phi_rotation : float, default 0 Rotation angle in degrees.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def __init__(
    self,
    angular_resolution: float = 2,
    cutoff_theta: float = 0,
    htm_level: int | None = None,
    phi_rotation: float = 0,
) -> None:
    """Initialize the HTM grid builder.

    Parameters
    ----------
    angular_resolution : float, default 2
        Angular resolution in degrees.
    cutoff_theta : float, default 0
        Maximum polar angle cutoff in degrees.
    htm_level : int | None, optional
        HTM subdivision level.
    phi_rotation : float, default 0
        Rotation angle in degrees.

    """
    super().__init__(angular_resolution, cutoff_theta, phi_rotation)

    if htm_level is not None:
        self.htm_level = htm_level
    else:
        # Heuristic: triangle edge at level n is ~90°/2ⁿ; aim for edges
        # roughly twice the requested angular resolution, clamped to [0, 15].
        target_edge_deg = angular_resolution * 2
        estimated = int(np.ceil(np.log2(90 / target_edge_deg)))
        self.htm_level = min(max(estimated, 0), 15)

    self._logger.info(
        f"HTM: level={self.htm_level}, ~{8 * 4**self.htm_level} triangles"
    )

get_grid_type()

Return the grid-type identifier string.

Returns

str "htm"

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
144
145
146
147
148
149
150
151
152
153
def get_grid_type(self) -> str:
    """Identify this builder's grid type.

    Returns
    -------
    str
        ``"htm"``

    """
    grid_type = GridType.HTM
    return grid_type.value

get_htm_info()

Get HTM-specific information.

Returns

info : dict Keys: htm_level, n_triangles_full_sphere, approx_edge_length_deg, approx_edge_length_arcmin.

Source code in packages/canvod-grids/src/canvod/grids/grids_impl/htm_grid.py
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
def get_htm_info(self) -> dict:
    """Summarise HTM-specific parameters of this builder.

    Returns
    -------
    info : dict
        Keys: ``htm_level``, ``n_triangles_full_sphere``,
        ``approx_edge_length_deg``, ``approx_edge_length_arcmin``.

    """
    level = self.htm_level
    # Edge length halves per level; triangle count quadruples per level.
    edge_deg = 90 / 2**level

    info = {
        "htm_level": level,
        "n_triangles_full_sphere": 8 * 4**level,
        "approx_edge_length_deg": edge_deg,
        "approx_edge_length_arcmin": edge_deg * 60,
    }
    return info

Grid Operations

Cell assignment and vertex extraction for hemisphere grids.

Functions in this module operate on :class:~canvod.grids.core.GridData instances and VOD xarray Datasets.

Cell assignment

add_cell_ids_to_vod_fast – vectorised KDTree lookup (preferred) add_cell_ids_to_vod – element-wise fallback add_cell_ids_to_ds_fast – dask-lazy variant for out-of-core data

Vertex / grid conversion

extract_grid_vertices – flat (x, y, z) arrays for 3-D visualisation grid_to_dataset – xarray Dataset with vertices and solid angles

add_cell_ids_to_vod_fast(vod_ds, grid, grid_name)

Assign grid cells to every observation in a VOD dataset (vectorised).

Uses a KDTree built from the grid cell centres for O(n log m) lookup.

Parameters

vod_ds : xr.Dataset VOD dataset with phi(epoch, sid) and theta(epoch, sid) coordinate variables and a VOD data variable. grid : GridData Hemisphere grid instance. grid_name : str Grid identifier used to name the output coordinate (cell_id_<grid_name>).

Returns

xr.Dataset vod_ds with an additional cell_id_<grid_name>(epoch, sid) variable. Observations with non-finite φ or θ receive NaN.

Source code in packages/canvod-grids/src/canvod/grids/operations.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def add_cell_ids_to_vod_fast(
    vod_ds: xr.Dataset, grid: GridData, grid_name: str
) -> xr.Dataset:
    """Assign grid cells to every observation in a VOD dataset (vectorised).

    Uses a KDTree built from the grid cell centres for O(n log m) lookup.

    Parameters
    ----------
    vod_ds : xr.Dataset
        VOD dataset with ``phi(epoch, sid)`` and ``theta(epoch, sid)``
        coordinate variables and a ``VOD`` data variable.
    grid : GridData
        Hemisphere grid instance.
    grid_name : str
        Grid identifier used to name the output coordinate
        (``cell_id_<grid_name>``).

    Returns
    -------
    xr.Dataset
        *vod_ds* with an additional ``cell_id_<grid_name>(epoch, sid)``
        variable.  Observations with non-finite φ or θ receive NaN.
        ``vod_ds.attrs["grid_references"]`` gains an entry
        ``grids/<grid_name>``.

    """
    start_time = time.time()
    print(f"\nAssigning cells for '{grid_name}'...")

    log.info(
        "cell_assignment_started",
        grid_name=grid_name,
        grid_cells=len(grid.grid),
        observations=vod_ds["VOD"].size,
        method="kdtree_fast",
    )

    tree = _build_kdtree(grid)
    cell_id_col = grid.grid["cell_id"].to_numpy()

    phi = vod_ds["phi"].values.ravel()
    theta = vod_ds["theta"].values.ravel()

    # Only finite (phi, theta) pairs are queried; the rest stay NaN.
    valid = np.isfinite(phi) & np.isfinite(theta)

    cell_ids = np.full(len(phi), np.nan, dtype=np.float64)

    if np.any(valid):
        cell_ids[valid] = _query_points(tree, cell_id_col, phi[valid], theta[valid])

    cell_ids_2d = cell_ids.reshape(vod_ds["VOD"].shape)

    coord_name = f"cell_id_{grid_name}"
    vod_ds[coord_name] = (("epoch", "sid"), cell_ids_2d)

    n_assigned = np.sum(np.isfinite(cell_ids_2d))
    n_unique = len(np.unique(cell_ids[np.isfinite(cell_ids)]))
    duration = time.time() - start_time

    print(f"  ✓ Assigned: {n_assigned:,} / {cell_ids_2d.size:,} observations")
    print(f"  ✓ Unique cells: {n_unique:,}")

    # Track grid references in dataset attrs, consistent with the
    # element-wise fallback ``add_cell_ids_to_vod`` — previously only the
    # fallback recorded this, so fast-path assignments were invisible to
    # readers of ``grid_references``.
    if "grid_references" not in vod_ds.attrs:
        vod_ds.attrs["grid_references"] = []
    vod_ds.attrs["grid_references"].append(f"grids/{grid_name}")

    log.info(
        "cell_assignment_complete",
        grid_name=grid_name,
        duration_seconds=round(duration, 2),
        observations_assigned=int(n_assigned),
        observations_total=cell_ids_2d.size,
        unique_cells=int(n_unique),
        coverage_percent=round(100 * n_assigned / cell_ids_2d.size, 2),
    )

    return vod_ds

add_cell_ids_to_vod(vod_ds, grid, grid_name)

Assign grid cells to a VOD dataset (element-wise fallback).

Slower than :func:add_cell_ids_to_vod_fast; kept for cases where the full dataset does not fit in memory as numpy arrays.

Parameters

vod_ds : xr.Dataset VOD dataset with phi, theta, and VOD variables. grid : GridData Hemisphere grid instance. grid_name : str Grid identifier for the output coordinate name.

Returns

xr.Dataset vod_ds with cell_id_<grid_name>(epoch, sid) added.

Source code in packages/canvod-grids/src/canvod/grids/operations.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def add_cell_ids_to_vod(
    vod_ds: xr.Dataset, grid: GridData, grid_name: str
) -> xr.Dataset:
    """Assign grid cells to a VOD dataset, one observation at a time.

    Element-wise fallback for :func:`add_cell_ids_to_vod_fast`: slower,
    but avoids holding large intermediate query arrays in memory.

    Parameters
    ----------
    vod_ds : xr.Dataset
        VOD dataset with ``phi``, ``theta``, and ``VOD`` variables.
    grid : GridData
        Hemisphere grid instance.
    grid_name : str
        Grid identifier for the output coordinate name.

    Returns
    -------
    xr.Dataset
        *vod_ds* with ``cell_id_<grid_name>(epoch, sid)`` added.

    """
    print(f"\nAssigning cells for '{grid_name}'...")

    tree = _build_kdtree(grid)
    lookup_ids = grid.grid["cell_id"].to_numpy()

    phi_values = vod_ds["phi"].to_numpy().ravel()
    theta_values = vod_ds["theta"].to_numpy().ravel()

    assigned = np.full(vod_ds["VOD"].size, np.nan)

    # One KDTree query per finite (phi, theta) pair; invalid pairs stay NaN.
    for idx, (p, t) in enumerate(zip(phi_values, theta_values)):
        if not (np.isfinite(p) and np.isfinite(t)):
            continue
        assigned[idx] = _query_points(
            tree, lookup_ids, np.array([p]), np.array([t])
        )[0]

    assigned_2d = assigned.reshape(vod_ds["VOD"].shape)

    coord_name = f"cell_id_{grid_name}"
    vod_ds[coord_name] = (("epoch", "sid"), assigned_2d)

    n_assigned = np.sum(~np.isnan(assigned_2d))
    print(f"  ✓ Added coordinate '{coord_name}'")
    print(f"  ✓ Assigned: {n_assigned:,} / {assigned_2d.size:,} observations")

    # Track grid references in dataset attrs
    if "grid_references" not in vod_ds.attrs:
        vod_ds.attrs["grid_references"] = []
    vod_ds.attrs["grid_references"].append(f"grids/{grid_name}")

    return vod_ds

add_cell_ids_to_ds_fast(ds, grid, grid_name, data_var='VOD')

Assign grid cells lazily via dask (avoids loading full arrays).

The output cell_id_<grid_name> variable is a dask array that computes on access or save.

Parameters

ds : xr.Dataset Dataset with dask-backed phi and theta arrays. grid : GridData Hemisphere grid instance. grid_name : str Grid identifier for the output coordinate name. data_var : str Name of the main data variable (used only for shape reference).

Returns

xr.Dataset ds with a lazy cell_id_<grid_name>(epoch, sid) variable.

Source code in packages/canvod-grids/src/canvod/grids/operations.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
def add_cell_ids_to_ds_fast(
    ds: xr.Dataset, grid: GridData, grid_name: str, data_var: str = "VOD"
) -> xr.Dataset:
    """Assign grid cells lazily via dask (avoids loading full arrays).

    The output ``cell_id_<grid_name>`` variable is a dask array that
    computes on access or save.

    Parameters
    ----------
    ds : xr.Dataset
        Dataset with dask-backed ``phi`` and ``theta`` arrays.
    grid : GridData
        Hemisphere grid instance.
    grid_name : str
        Grid identifier for the output coordinate name.
    data_var : str
        Name of the main data variable (used only for shape reference).

    """
    import dask.array as da

    print(f"\nAssigning cells for '{grid_name}'...")

    # Built eagerly; the chunk function below closes over `tree` and
    # `cell_id_col`, so every dask worker task carries them along.
    tree = _build_kdtree(grid)
    cell_id_col = grid.grid["cell_id"].to_numpy()

    def _assign_chunk(
        phi_chunk: np.ndarray,
        theta_chunk: np.ndarray,
    ) -> np.ndarray:
        """Assign cell IDs for a chunk of data.

        Parameters
        ----------
        phi_chunk : np.ndarray
            Chunk of azimuth values.
        theta_chunk : np.ndarray
            Chunk of elevation values.

        Returns
        -------
        np.ndarray
            Chunk of cell IDs.

        """
        phi_flat = phi_chunk.ravel()
        theta_flat = theta_chunk.ravel()

        # Query only finite pairs; everything else stays NaN.
        valid = np.isfinite(phi_flat) & np.isfinite(theta_flat)
        cell_ids = np.full(len(phi_flat), np.nan, dtype=np.float32)

        if np.any(valid):
            cell_ids[valid] = _query_points(
                tree, cell_id_col, phi_flat[valid], theta_flat[valid]
            )

        return cell_ids.reshape(phi_chunk.shape)

    # drop_axis=[] : output chunks have the same shape as the input chunks.
    cell_ids_dask = da.map_blocks(
        _assign_chunk,
        ds["phi"].data,
        ds["theta"].data,
        dtype=np.float32,
        drop_axis=[],
    )

    coord_name = f"cell_id_{grid_name}"
    ds[coord_name] = (("epoch", "sid"), cell_ids_dask)

    print("  ✓ Cell IDs assigned as lazy dask array")
    print("  ✓ Will compute on access/save")

    return ds

extract_grid_vertices(grid)

Extract 3D vertices from hemisphere grid cells.

Dispatches to a grid-type–specific extractor. The returned arrays are flat (not per-cell); use them directly for 3-D scatter plots.

Parameters

grid : GridData Hemisphere grid instance.

Returns

x_vertices, y_vertices, z_vertices : np.ndarray Cartesian vertex coordinates on the unit sphere.

Source code in packages/canvod-grids/src/canvod/grids/operations.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def extract_grid_vertices(grid: GridData) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Extract 3D vertices from hemisphere grid cells.

    Dispatches to the extractor registered for ``grid.grid_type``.  The
    returned arrays are flat (not per-cell); use them directly for 3-D
    scatter plots.

    Parameters
    ----------
    grid : GridData
        Hemisphere grid instance.

    Returns
    -------
    x_vertices, y_vertices, z_vertices : np.ndarray
        Cartesian vertex coordinates on the unit sphere.

    Raises
    ------
    ValueError
        If ``grid.grid_type`` has no registered extractor.

    """
    # All three rectangular-style grids share one extractor.
    rectangular = _extract_rectangular_vertices
    dispatch = {
        "htm": _extract_htm_vertices,
        "geodesic": _extract_geodesic_vertices,
        "equal_area": rectangular,
        "equal_angle": rectangular,
        "equirectangular": rectangular,
        "healpix": _extract_healpix_vertices,
        "fibonacci": _extract_fibonacci_vertices,
    }
    if grid.grid_type not in dispatch:
        raise ValueError(f"Unknown grid type: {grid.grid_type}")
    return dispatch[grid.grid_type](grid)

grid_to_dataset(grid)

Convert a HemiGrid to a unified xarray Dataset with vertices.

The returned Dataset carries cell centres, NaN-padded vertex arrays, vertex counts, and solid angles — all indexed by cell_id.

Parameters

grid : GridData Hemisphere grid instance.

Returns

xr.Dataset Dataset with dimensions (cell_id, vertex) and variables cell_phi, cell_theta, vertices_phi, vertices_theta, n_vertices, solid_angle.

Notes

This function is distinct from :meth:HemiGridStorageAdapter._prepare_vertices_dataframe in canvod-store. That method produces a long-form DataFrame for zarr ragged-array storage; this one produces a rectangular xarray Dataset suitable for analysis and visualisation.

Source code in packages/canvod-grids/src/canvod/grids/operations.py
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
def _unit_angles(v: np.ndarray) -> tuple[float, float] | None:
    """Return (theta, phi) on the unit sphere for Cartesian *v*, or None for a zero vector."""
    r = np.linalg.norm(v)
    if r == 0:
        return None
    x, y, z = v / r
    theta = np.arccos(np.clip(z, -1, 1))
    phi = np.mod(np.arctan2(y, x), 2 * np.pi)
    return theta, phi


def _bbox_quad_vertices(grid: GridData, n_cells: int) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Build 4-corner vertex arrays from per-cell phi/theta bounding boxes."""
    vertices_phi = np.full((n_cells, 4), np.nan)
    vertices_theta = np.full((n_cells, 4), np.nan)
    n_vertices = np.full(n_cells, 4, dtype=np.int32)

    for i, row in enumerate(grid.grid.iter_rows(named=True)):
        phi_min, phi_max = row["phi_min"], row["phi_max"]
        theta_min, theta_max = row["theta_min"], row["theta_max"]
        vertices_phi[i, :] = [phi_min, phi_max, phi_max, phi_min]
        vertices_theta[i, :] = [theta_min, theta_min, theta_max, theta_max]

    return vertices_phi, vertices_theta, n_vertices


def grid_to_dataset(grid: GridData) -> xr.Dataset:
    """Convert a HemiGrid to a unified xarray Dataset with vertices.

    The returned Dataset carries cell centres, NaN-padded vertex arrays,
    vertex counts, and solid angles — all indexed by ``cell_id``.

    Parameters
    ----------
    grid : GridData
        Hemisphere grid instance.

    Returns
    -------
    xr.Dataset
        Dataset with dimensions ``(cell_id, vertex)`` and variables
        ``cell_phi``, ``cell_theta``, ``vertices_phi``, ``vertices_theta``,
        ``n_vertices``, ``solid_angle``.

    Raises
    ------
    ValueError
        If ``grid.grid_type`` is not a supported grid type.

    Notes
    -----
    This function is distinct from
    :meth:`HemiGridStorageAdapter._prepare_vertices_dataframe` in
    ``canvod-store``. That method produces a long-form DataFrame for zarr
    ragged-array storage; this one produces a rectangular xarray Dataset
    suitable for analysis and visualisation.

    """
    n_cells = grid.ncells
    grid_type = grid.grid_type

    if grid_type in ("equal_area", "equal_angle", "equirectangular", "healpix"):
        # Rectangular and HEALPix cells both store a 4-corner bounding box,
        # so they share the same vertex layout.
        max_v = 4
        vertices_phi, vertices_theta, n_vertices = _bbox_quad_vertices(grid, n_cells)

    elif grid_type == "htm":
        max_v = 4  # padded to 4 for rectangular layout; only 3 used
        vertices_phi = np.full((n_cells, max_v), np.nan)
        vertices_theta = np.full((n_cells, max_v), np.nan)
        n_vertices = np.full(n_cells, 3, dtype=np.int32)

        for i, row in enumerate(grid.grid.iter_rows(named=True)):
            for j, col in enumerate(["htm_vertex_0", "htm_vertex_1", "htm_vertex_2"]):
                angles = _unit_angles(np.array(row[col], dtype=float))
                if angles is None:
                    # Degenerate zero-length vertex: leave NaN padding.
                    continue
                vertices_theta[i, j], vertices_phi[i, j] = angles

    elif grid_type == "geodesic":
        max_v = 3
        vertices_phi = np.full((n_cells, max_v), np.nan)
        vertices_theta = np.full((n_cells, max_v), np.nan)
        n_vertices = np.full(n_cells, 3, dtype=np.int32)

        shared = grid.vertices  # shared vertex array from GridData
        if shared is not None and "geodesic_vertices" in grid.grid.columns:
            shared = np.asarray(shared, dtype=float)
            for i, row in enumerate(grid.grid.iter_rows(named=True)):
                for j, v_idx in enumerate(row["geodesic_vertices"]):
                    angles = _unit_angles(shared[int(v_idx)])
                    if angles is None:
                        continue
                    vertices_theta[i, j], vertices_phi[i, j] = angles
        else:
            # Fallback: use cell centres as single-point "vertices"
            n_vertices[:] = 1
            vertices_phi[:, 0] = grid.grid["phi"].to_numpy()
            vertices_theta[:, 0] = grid.grid["theta"].to_numpy()

    elif grid_type == "fibonacci":
        # Point-based: single vertex per cell (the centre)
        max_v = 1
        vertices_phi = grid.grid["phi"].to_numpy().reshape(n_cells, 1)
        vertices_theta = grid.grid["theta"].to_numpy().reshape(n_cells, 1)
        n_vertices = np.ones(n_cells, dtype=np.int32)

    else:
        raise ValueError(f"Unknown grid type: {grid_type}")

    # Cell centres
    cell_phi = grid.grid["phi"].to_numpy()
    cell_theta = grid.grid["theta"].to_numpy()
    solid_angles = grid.get_solid_angles()

    ds = xr.Dataset(
        {
            "cell_phi": (["cell_id"], cell_phi),
            "cell_theta": (["cell_id"], cell_theta),
            "vertices_phi": (["cell_id", "vertex"], vertices_phi),
            "vertices_theta": (["cell_id", "vertex"], vertices_theta),
            "n_vertices": (["cell_id"], n_vertices),
            "solid_angle": (["cell_id"], solid_angles),
        },
        coords={
            "cell_id": np.arange(n_cells),
            "vertex": np.arange(max_v),
        },
        attrs={
            "grid_type": grid.grid_type,
            "angular_resolution": (
                grid.metadata.get("angular_resolution", 0.0) if grid.metadata else 0.0
            ),
            "cutoff_theta": (
                grid.metadata.get("cutoff_theta", 0.0) if grid.metadata else 0.0
            ),
            "n_cells": n_cells,
        },
    )

    return ds

store_grid(grid, store, grid_name)

Store grid in unified xarray format to Icechunk store.

Converts the grid to an xarray Dataset with vertex information and writes it to the grids/ group in the store.

Parameters

grid : GridData Hemisphere grid instance to store. store Icechunk store instance (e.g., MyIcechunkStore). grid_name : str Grid identifier for storage path (e.g., 'equal_area_4deg').

Returns

str Snapshot ID from the commit.

Examples

from canvod.grids import create_hemigrid, store_grid grid = create_hemigrid(angular_resolution=4, grid_type='equal_area') snapshot_id = store_grid(grid, my_store, 'equal_area_4deg')

Source code in packages/canvod-grids/src/canvod/grids/operations.py
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
def store_grid(
    grid: GridData,
    store: Any,
    grid_name: str,
) -> str:
    """Persist a hemisphere grid to an Icechunk store in unified xarray form.

    The grid is first converted to the unified xarray Dataset layout
    (cell centres, vertices, solid angles) and then written under the
    ``grids/`` group of the store.

    Parameters
    ----------
    grid : GridData
        Hemisphere grid instance to store.
    store
        Icechunk store instance (e.g., MyIcechunkStore).
    grid_name : str
        Grid identifier for storage path (e.g., 'equal_area_4deg').

    Returns
    -------
    str
        Snapshot ID from the commit.

    Examples
    --------
    >>> from canvod.grids import create_hemigrid, store_grid
    >>> grid = create_hemigrid(angular_resolution=4, grid_type='equal_area')
    >>> snapshot_id = store_grid(grid, my_store, 'equal_area_4deg')

    """
    print(f"\nStoring grid '{grid_name}'...")

    # Unified xarray representation (centres, vertices, solid angles, attrs).
    dataset = grid_to_dataset(grid)

    # All grids live under the top-level ``grids/`` group.
    target_group = f"grids/{grid_name}"

    with store.writable_session() as session:
        from icechunk.xarray import to_icechunk

        to_icechunk(dataset, session, group=target_group, mode="w")
        snapshot = session.commit(f"Stored {grid_name} grid structure")

    print(f"  ✓ Stored to '{target_group}'")
    print(f"  ✓ Snapshot: {snapshot[:8]}...")
    print(f"  ✓ Cells: {grid.ncells}, Type: {grid.grid_type}")

    return snapshot

load_grid(store, grid_name)

Load a grid from Icechunk store.

Loads the grid structure from grids/{grid_name} and reconstructs a GridData object.

Parameters

store Icechunk store instance (e.g., MyIcechunkStore). grid_name : str Grid identifier (e.g., 'equal_area_4deg').

Returns

GridData Reconstructed grid instance.

Examples

from canvod.grids import load_grid grid = load_grid(my_store, 'equal_area_4deg') print(f"Loaded {grid.ncells} cells")

Source code in packages/canvod-grids/src/canvod/grids/operations.py
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
def load_grid(
    store: Any,
    grid_name: str,
) -> GridData:
    """Load a grid from Icechunk store.

    Loads the grid structure from ``grids/{grid_name}`` and reconstructs
    a GridData object.

    Parameters
    ----------
    store
        Icechunk store instance (e.g., MyIcechunkStore).
    grid_name : str
        Grid identifier (e.g., 'equal_area_4deg').

    Returns
    -------
    GridData
        Reconstructed grid instance.

    Raises
    ------
    ValueError
        If the stored dataset lacks a ``grid_type`` attribute.

    Examples
    --------
    >>> from canvod.grids import load_grid
    >>> grid = load_grid(my_store, 'equal_area_4deg')
    >>> print(f"Loaded {grid.ncells} cells")

    """
    import polars as pl

    print(f"\nLoading grid '{grid_name}'...")

    group_path = f"grids/{grid_name}"

    # Load from store
    with store.readonly_session() as session:
        ds_grid = xr.open_zarr(session.store, group=group_path, consolidated=False)

    # Extract metadata
    grid_type = ds_grid.attrs.get("grid_type")
    angular_resolution = ds_grid.attrs.get("angular_resolution", 0.0)
    # NOTE(review): fallback here is pi/2 (radians) while the writer defaults
    # the attribute to 0.0 — units/defaults look inconsistent; confirm intended.
    cutoff_theta = ds_grid.attrs.get("cutoff_theta", np.pi / 2)

    if not grid_type:
        raise ValueError(f"Grid '{grid_name}' missing grid_type attribute")

    # Reconstruct grid DataFrame from cell centers
    cell_phi = ds_grid["cell_phi"].values
    cell_theta = ds_grid["cell_theta"].values
    n_cells = len(cell_phi)

    # Build basic grid DataFrame
    grid_data = {
        "cell_id": np.arange(n_cells),
        "phi": cell_phi,
        "theta": cell_theta,
    }

    # Add grid-type-specific columns
    if grid_type in ("equal_area", "equal_angle", "equirectangular"):
        # Reconstruct boundaries from vertices.  Vertex order matches the
        # writer: phi = [min, max, max, min], theta = [min, min, max, max].
        vertices_phi = ds_grid["vertices_phi"].values
        vertices_theta = ds_grid["vertices_theta"].values

        phi_min = vertices_phi[:, 0]  # All 4 vertices are corners
        phi_max = vertices_phi[:, 1]
        theta_min = vertices_theta[:, 0]
        theta_max = vertices_theta[:, 2]

        grid_data.update(
            {
                "phi_min": phi_min,
                "phi_max": phi_max,
                "theta_min": theta_min,
                "theta_max": theta_max,
            }
        )

    elif grid_type == "htm":
        # Reconstruct HTM vertices from spherical to Cartesian
        vertices_phi = ds_grid["vertices_phi"].values
        vertices_theta = ds_grid["vertices_theta"].values

        htm_v0 = []
        htm_v1 = []
        htm_v2 = []

        for i in range(n_cells):
            vertices = []
            # Each HTM cell is a spherical triangle: 3 unit vectors.
            for j in range(3):
                phi_v = vertices_phi[i, j]
                theta_v = vertices_theta[i, j]
                x = np.sin(theta_v) * np.cos(phi_v)
                y = np.sin(theta_v) * np.sin(phi_v)
                z = np.cos(theta_v)
                vertices.append([x, y, z])

            htm_v0.append(vertices[0])
            htm_v1.append(vertices[1])
            htm_v2.append(vertices[2])

        grid_data.update(
            {
                "htm_vertex_0": htm_v0,
                "htm_vertex_1": htm_v1,
                "htm_vertex_2": htm_v2,
            }
        )

    # Create Polars DataFrame
    df_grid = pl.DataFrame(grid_data)

    # Reconstruct theta_lims, phi_lims, and cell_ids from grid
    # These are required for GridData but not critical for basic usage
    # NOTE(review): grouping relies on bit-identical float theta centres
    # surviving the store round-trip — confirm this holds for all grid types.
    unique_theta = sorted(df_grid["theta"].unique().to_list())
    theta_lims = np.array(unique_theta)

    # Group cells by theta bin
    phi_lims_list = []
    cell_ids_list = []
    for theta_val in unique_theta:
        theta_cells = df_grid.filter(pl.col("theta") == theta_val)
        phi_vals = sorted(theta_cells["phi"].to_list())
        cell_ids = theta_cells["cell_id"].to_list()

        phi_lims_list.append(np.array(phi_vals))
        cell_ids_list.append(np.array(cell_ids))

    # Create GridData instance
    from canvod.grids.core import GridData

    grid = GridData(
        grid=df_grid,
        theta_lims=theta_lims,
        phi_lims=phi_lims_list,
        cell_ids=cell_ids_list,
        grid_type=grid_type,
        metadata={
            "angular_resolution": angular_resolution,
            "cutoff_theta": cutoff_theta,
        },
    )

    print(f"  ✓ Loaded from '{group_path}'")
    print(f"  ✓ Cells: {grid.ncells}, Type: {grid.grid_type}")

    return grid

Aggregation

Per-cell aggregation of VOD observations onto hemisphere grids.

Top-level entry points

aggregate_data_to_grid – single-statistic spatial aggregation compute_percell_timeseries – chunked (cell × time) time-series

Analysis helpers

compute_global_average – observation-count–weighted global mean compute_regional_average – same, restricted to a cell subset analyze_diurnal_patterns – hourly groupby analyze_spatial_patterns – time-averaged spatial field

Convenience wrappers

compute_hemisphere_percell – daily, full hemisphere compute_zenith_percell – daily, θ ≤ 30°

CellAggregator

Polars-based per-cell aggregation helpers.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
class CellAggregator:
    """Polars-based per-cell aggregation helpers."""

    @staticmethod
    def aggregate_by_cell(
        df: pl.DataFrame,
        value_var: str = "VOD",
        method: str = "mean",
    ) -> pl.DataFrame:
        """Aggregate one value column per ``cell_id``.

        Parameters
        ----------
        df : pl.DataFrame
            Must contain ``cell_id`` and *value_var* columns.
        value_var : str
            Column to aggregate.
        method : {'mean', 'median', 'std', 'count'}
            Aggregation method.

        Returns
        -------
        pl.DataFrame
            Two-column DataFrame: ``cell_id``, *value_var*, sorted by cell.

        Raises
        ------
        ValueError
            On a missing column or an unknown *method*.

        """
        # Validate required columns before grouping.
        if "cell_id" not in df.columns:
            raise ValueError("pl.DataFrame must have 'cell_id' column")
        if value_var not in df.columns:
            raise ValueError(f"pl.DataFrame must have '{value_var}' column")

        # Map method name -> polars aggregation expression.
        column = pl.col(value_var)
        expr_lookup = {
            "mean": column.mean(),
            "median": column.median(),
            "std": column.std(),
            "count": column.count(),
        }
        expr = expr_lookup.get(method)
        if expr is None:
            raise ValueError(f"Unknown method: {method}")

        grouped = df.group_by("cell_id").agg(expr.alias(value_var))
        return grouped.sort("cell_id")

aggregate_by_cell(df, value_var='VOD', method='mean') staticmethod

Aggregate values by cell_id.

Parameters

df : pl.DataFrame Must contain cell_id and value_var columns. value_var : str Column to aggregate. method : {'mean', 'median', 'std', 'count'} Aggregation method.

Returns

pl.DataFrame Two-column DataFrame: cell_id, value_var.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
@staticmethod
def aggregate_by_cell(
    df: pl.DataFrame,
    value_var: str = "VOD",
    method: str = "mean",
) -> pl.DataFrame:
    """Aggregate values by ``cell_id``.

    Parameters
    ----------
    df : pl.DataFrame
        Must contain ``cell_id`` and *value_var* columns.
    value_var : str
        Column to aggregate.
    method : {'mean', 'median', 'std', 'count'}
        Aggregation method.

    Returns
    -------
    pl.DataFrame
        Two-column DataFrame: ``cell_id``, *value_var*.

    Raises
    ------
    ValueError
        On a missing column or an unknown *method*.

    """
    # Validate required columns before grouping.
    if "cell_id" not in df.columns:
        raise ValueError("pl.DataFrame must have 'cell_id' column")
    if value_var not in df.columns:
        raise ValueError(f"pl.DataFrame must have '{value_var}' column")

    # Map method name -> polars aggregation expression.
    agg_map = {
        "mean": pl.col(value_var).mean(),
        "median": pl.col(value_var).median(),
        "std": pl.col(value_var).std(),
        "count": pl.col(value_var).count(),
    }
    if method not in agg_map:
        raise ValueError(f"Unknown method: {method}")

    # Result keeps the original column name and is sorted by cell.
    return (
        df.group_by("cell_id").agg(agg_map[method].alias(value_var)).sort("cell_id")
    )

aggregate_data_to_grid(data_ds, grid, value_var='VOD', cell_var='cell_id_equal_area_2deg', sid=None, time_range=None, stat='median')

Aggregate VOD data across all timestamps and SIDs to per-cell statistics.

Parameters

data_ds : xr.Dataset Full VOD dataset from the Icechunk store. grid : GridData Grid definition. value_var : str Name of the VOD variable. cell_var : str Name of the cell-ID variable in data_ds. sid : list[str], optional Satellite IDs to include. None → all. time_range : tuple, optional (start, end) datetimes for epoch filtering. stat : {'mean', 'median', 'std'} Statistic to compute per cell.

Returns

np.ndarray Array of length grid.ncells with per-cell aggregated values (NaN where no observations exist).

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def aggregate_data_to_grid(
    data_ds: xr.Dataset,
    grid: GridData,
    value_var: str = "VOD",
    cell_var: str = "cell_id_equal_area_2deg",
    sid: list[str] | None = None,
    time_range: tuple | None = None,
    stat: str = "median",
) -> np.ndarray:
    """Aggregate VOD data across all timestamps and SIDs to per-cell statistics.

    Parameters
    ----------
    data_ds : xr.Dataset
        Full VOD dataset from the Icechunk store.
    grid : GridData
        Grid definition.
    value_var : str
        Name of the VOD variable.
    cell_var : str
        Name of the cell-ID variable in *data_ds*.
    sid : list[str], optional
        Satellite IDs to include.  ``None`` → all.
    time_range : tuple, optional
        ``(start, end)`` datetimes for epoch filtering.
    stat : {'mean', 'median', 'std'}
        Statistic to compute per cell.

    Returns
    -------
    np.ndarray
        Array of length ``grid.ncells`` with per-cell aggregated values
        (NaN where no observations exist).

    Raises
    ------
    ValueError
        If the data has no ``sid`` dimension or *stat* is unsupported.

    """
    vod = data_ds[value_var]
    cell_ids = data_ds[cell_var]

    # A 'sid' dimension is mandatory; select a subset or use all of them.
    if "sid" not in vod.dims:
        raise ValueError("No SID dimension in data.")
    if sid is not None:
        vod = vod.sel(sid=sid)
        cell_ids = cell_ids.sel(sid=sid)
        print(f"Using SIDs: {sid}")
    else:
        available_sids = vod.sid.values.tolist()
        print(f"Using all available SIDs: {available_sids}")

    # Optional epoch window.
    if time_range is not None:
        vod = vod.sel(epoch=slice(time_range[0], time_range[1]))
        cell_ids = cell_ids.sel(epoch=slice(time_range[0], time_range[1]))

    # Flatten to 1-D observation pairs and drop NaNs from either side.
    values = np.asarray(vod.values).ravel()
    cells = np.asarray(cell_ids.values).ravel()
    keep = np.isfinite(values) & np.isfinite(cells)

    observations = pl.DataFrame(
        {
            "cell_id": cells[keep].astype(np.int64),
            "vod": values[keep],
        }
    )

    stat_exprs = {
        "mean": pl.col("vod").mean(),
        "median": pl.col("vod").median(),
        "std": pl.col("vod").std(),
    }
    if stat not in stat_exprs:
        raise ValueError(f"Unsupported stat: {stat}")

    per_cell = observations.group_by("cell_id").agg(
        stat_exprs[stat].alias("vod_stat")
    )

    # Scatter statistics into a dense per-cell array; cells without data
    # remain NaN.
    result = np.full(grid.ncells, np.nan, dtype=float)
    result[per_cell["cell_id"].to_numpy()] = per_cell["vod_stat"].to_numpy()
    return result

compute_percell_timeseries(data_ds, grid, value_var='VOD', cell_var='cell_id_equal_area_2deg', theta_range=None, phi_range=None, selected_sids=None, time_range=None, temporal_resolution='1D', chunk_days=21, min_obs_per_cell_time=1)

Compute time series per cell with SID aggregation.

Processing is chunked over time to bound memory usage.

Parameters

data_ds : xr.Dataset Full VOD dataset. grid : GridData Grid definition. value_var : str VOD variable name. cell_var : str Cell-ID variable name. theta_range : tuple, optional (min_deg, max_deg) elevation filter. phi_range : tuple, optional (min_deg, max_deg) azimuth filter (wraps at 360°). selected_sids : list[str], optional Satellite IDs to include. time_range : tuple, optional (start, end) epoch slice. temporal_resolution : str Pandas/polars frequency string (e.g. "1D", "30min"). chunk_days : int Days per processing chunk. min_obs_per_cell_time : int Minimum SID observations per (cell, time-bin) to retain.

Returns

xr.Dataset Dataset with dimensions (cell, time) and variables cell_timeseries, cell_weights, cell_counts, cell_theta, cell_phi.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
def compute_percell_timeseries(
    data_ds: xr.Dataset,
    grid: GridData,
    value_var: str = "VOD",
    cell_var: str = "cell_id_equal_area_2deg",
    theta_range: tuple[float, float] | None = None,
    phi_range: tuple[float, float] | None = None,
    selected_sids: list[str] | None = None,
    time_range: tuple | None = None,
    temporal_resolution: str = "1D",
    chunk_days: int = 21,
    min_obs_per_cell_time: int = 1,
) -> xr.Dataset:
    """Compute time series per cell with SID aggregation.

    Processing is chunked over time to bound memory usage.

    Parameters
    ----------
    data_ds : xr.Dataset
        Full VOD dataset.
    grid : GridData
        Grid definition.
    value_var : str
        VOD variable name.
    cell_var : str
        Cell-ID variable name.
    theta_range : tuple, optional
        ``(min_deg, max_deg)`` elevation filter.
    phi_range : tuple, optional
        ``(min_deg, max_deg)`` azimuth filter (wraps at 360°).
    selected_sids : list[str], optional
        Satellite IDs to include.
    time_range : tuple, optional
        ``(start, end)`` epoch slice.
    temporal_resolution : str
        Pandas/polars frequency string (e.g. ``"1D"``, ``"30min"``).
    chunk_days : int
        Days per processing chunk.
    min_obs_per_cell_time : int
        Minimum SID observations per (cell, time-bin) to retain.

    Returns
    -------
    xr.Dataset
        Dataset with dimensions ``(cell, time)`` and variables
        ``cell_timeseries``, ``cell_weights``, ``cell_counts``,
        ``cell_theta``, ``cell_phi``.

    """
    print("📍 PER-CELL TIME SERIES AGGREGATION")
    print("=" * 60)
    print(f"📦 Chunk size: {chunk_days} days")
    print(f"🕒 Resolution: {temporal_resolution}")

    start_time = time.time()

    # Spatial subset of cells by elevation/azimuth window.
    selected_cells = _create_spatial_selection(grid, theta_range, phi_range)
    print(f"📍 Selected cells: {len(selected_cells)}")

    if selected_sids is not None and "sid" in data_ds.dims:
        data_ds = data_ds.sel(sid=selected_sids)
        print(f"🛰️  Selected SIDs: {len(selected_sids)}")

    if time_range is not None:
        data_ds = data_ds.sel(epoch=slice(time_range[0], time_range[1]))

    print(f"📊 Data shape: {data_ds[value_var].shape}")

    time_start, time_end = data_ds.epoch.values[0], data_ds.epoch.values[-1]

    # Normalise frequency string for pandas
    pandas_freq = _normalise_pandas_freq(temporal_resolution)

    # Common time axis covering the full span at the requested resolution.
    output_times = pd.date_range(
        start=pd.to_datetime(time_start),
        end=pd.to_datetime(time_end),
        freq=pandas_freq,
    )

    n_times = len(output_times)
    n_cells = len(selected_cells)

    print(f"📅 Output shape: {n_cells} cells × {n_times} time bins")

    # Result arrays (cell × time); NaN marks "no data yet".
    cell_timeseries = np.full((n_cells, n_times), np.nan)
    cell_weights = np.zeros((n_cells, n_times))
    cell_counts = np.zeros((n_cells, n_times), dtype=int)

    # Lookup tables mapping cell IDs / time bins to output array indices.
    cell_to_idx = {int(cell_id): i for i, cell_id in enumerate(selected_cells)}
    time_to_idx = {pd.Timestamp(t): i for i, t in enumerate(output_times)}

    chunk_starts = pd.date_range(
        start=pd.to_datetime(time_start),
        end=pd.to_datetime(time_end),
        freq=f"{chunk_days}D",
    )

    print(f"🔄 Processing {len(chunk_starts)} chunks...")

    # NOTE(review): xarray label slices are inclusive at both ends, so an
    # epoch exactly on a chunk boundary is seen by two consecutive chunks —
    # confirm that _merge_percell_results handles the duplicate correctly.
    for chunk_start in tqdm(chunk_starts, desc="Processing chunks"):
        chunk_end = min(
            chunk_start + pd.Timedelta(days=chunk_days),
            pd.to_datetime(time_end),
        )
        chunk_data = data_ds.sel(epoch=slice(chunk_start, chunk_end))

        if len(chunk_data.epoch) == 0:
            continue

        # Per-chunk (cell, time-bin) aggregation; empty dict when nothing
        # passed the min-observation threshold.
        chunk_results = _process_chunk_percell(
            chunk_data,
            selected_cells,
            temporal_resolution,
            value_var,
            cell_var,
            min_obs_per_cell_time,
        )

        if chunk_results:
            # Write chunk results into the global (cell × time) arrays.
            _merge_percell_results(
                chunk_results,
                cell_timeseries,
                cell_weights,
                cell_counts,
                cell_to_idx,
                time_to_idx,
            )

        # Release chunk memory eagerly; chunks can be large.
        del chunk_data
        gc.collect()

    processing_time = time.time() - start_time

    result_ds = _create_percell_dataset(
        cell_timeseries,
        cell_weights,
        cell_counts,
        selected_cells,
        output_times,
        grid,
        processing_time,
        theta_range,
        phi_range,
        temporal_resolution,
        chunk_days,
    )

    # Coverage = fraction of (cell, time-bin) slots that received data.
    valid_cell_times = np.sum(np.isfinite(cell_timeseries))
    total_cell_times = n_cells * n_times
    coverage = (valid_cell_times / total_cell_times) * 100

    print("\n✅ PER-CELL AGGREGATION COMPLETE!")
    print(f"⚡ Time: {processing_time / 60:.2f} minutes")
    print(f"📊 Output: {n_cells} cells × {n_times} times")
    print(f"📈 Coverage: {valid_cell_times:,} / {total_cell_times:,} ({coverage:.1f}%)")
    print("🎯 Ready for diurnal analysis, spatial patterns, custom aggregations!")

    return result_ds

compute_global_average(percell_ds)

Compute observation-count–weighted global average from per-cell data.

Parameters

percell_ds : xr.Dataset Output of :func:compute_percell_timeseries.

Returns

xr.Dataset Variables: global_timeseries, spatial_std, total_weights, active_cells.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
def compute_global_average(percell_ds: xr.Dataset) -> xr.Dataset:
    """Compute observation-count–weighted global average from per-cell data.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Output of :func:`compute_percell_timeseries`.

    Returns
    -------
    xr.Dataset
        Variables: ``global_timeseries``, ``spatial_std``,
        ``total_weights``, ``active_cells``.

    """
    values = percell_ds.cell_timeseries
    weights = percell_ds.cell_weights

    # Weighted mean over the cell dimension, per time bin.
    total_weights = weights.sum(dim="cell", skipna=True)
    global_mean = (values * weights).sum(dim="cell", skipna=True) / total_weights

    # Unweighted spatial spread across cells with finite values.
    finite = np.isfinite(values)
    spatial_spread = values.where(finite).std(dim="cell", skipna=True)

    return xr.Dataset(
        {
            "global_timeseries": global_mean,
            "spatial_std": spatial_spread,
            "total_weights": total_weights,
            "active_cells": finite.sum(dim="cell"),
        }
    )

compute_regional_average(percell_ds, region_cells)

Compute observation-count–weighted average for a cell subset.

Parameters

percell_ds : xr.Dataset Output of :func:compute_percell_timeseries. region_cells : array-like Cell IDs defining the region.

Returns

xr.DataArray Weighted regional mean time series.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
def compute_regional_average(
    percell_ds: xr.Dataset, region_cells: list[int] | np.ndarray
) -> xr.DataArray:
    """Compute observation-count–weighted average for a cell subset.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Output of :func:`compute_percell_timeseries`.
    region_cells : array-like
        Cell IDs defining the region.

    Returns
    -------
    xr.DataArray
        Weighted regional mean time series.

    """
    # Restrict to the requested cells, then form the weighted mean per
    # time bin exactly as in the global case.
    subset = percell_ds.sel(cell=region_cells)
    values = subset.cell_timeseries
    weights = subset.cell_weights

    numerator = (values * weights).sum(dim="cell", skipna=True)
    denominator = weights.sum(dim="cell", skipna=True)
    return numerator / denominator

analyze_diurnal_patterns(percell_ds)

Compute hourly means from per-cell time series.

Parameters

percell_ds : xr.Dataset Output of :func:compute_percell_timeseries.

Returns

xr.Dataset Grouped by time.hour.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
def analyze_diurnal_patterns(percell_ds: xr.Dataset) -> xr.Dataset:
    """Compute hourly means from per-cell time series.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Output of :func:`compute_percell_timeseries`.

    Returns
    -------
    xr.Dataset
        Hourly climatology grouped by ``time.hour``.

    """
    # Group all time bins by hour of day, then average within each group.
    hourly_groups = percell_ds.groupby("time.hour")
    return hourly_groups.mean(dim="time")

analyze_spatial_patterns(percell_ds)

Compute time-averaged spatial field.

Parameters

percell_ds : xr.Dataset Output of :func:compute_percell_timeseries.

Returns

xr.Dataset Time-averaged dataset.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
def analyze_spatial_patterns(percell_ds: xr.Dataset) -> xr.Dataset:
    """Compute time-averaged spatial field.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Output of :func:`compute_percell_timeseries`.

    Returns
    -------
    xr.Dataset
        Time-averaged dataset (one value per cell).

    """
    # Collapse the time dimension; the per-cell mean is the spatial field.
    time_mean = percell_ds.mean(dim="time")
    return time_mean

compute_hemisphere_percell(data_ds, grid, **kwargs)

Daily per-cell time series for the full hemisphere.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
447
448
449
450
451
452
453
454
455
def compute_hemisphere_percell(
    data_ds: xr.Dataset,
    grid: GridData,
    **kwargs: Any,
) -> xr.Dataset:
    """Daily per-cell time series for the full hemisphere.

    Thin wrapper around :func:`compute_percell_timeseries` that defaults
    ``temporal_resolution`` to daily bins.

    Parameters
    ----------
    data_ds : xr.Dataset
        Full VOD dataset.
    grid : GridData
        Grid definition.
    **kwargs
        Forwarded to :func:`compute_percell_timeseries`.

    Returns
    -------
    xr.Dataset
        Per-cell time-series dataset.

    """
    # setdefault keeps the daily default while allowing callers to override
    # it; passing temporal_resolution in **kwargs previously raised
    # TypeError ("got multiple values for keyword argument").
    kwargs.setdefault("temporal_resolution", "1D")
    return compute_percell_timeseries(data_ds=data_ds, grid=grid, **kwargs)

compute_zenith_percell(data_ds, grid, **kwargs)

Daily per-cell time series restricted to θ ≤ 30°.

Source code in packages/canvod-grids/src/canvod/grids/aggregation.py
458
459
460
461
462
463
464
465
466
467
468
469
470
def compute_zenith_percell(
    data_ds: xr.Dataset,
    grid: GridData,
    **kwargs: Any,
) -> xr.Dataset:
    """Daily per-cell time series restricted to θ ≤ 30°.

    Thin wrapper around :func:`compute_percell_timeseries` that defaults
    ``theta_range`` to the zenith cone (0–30°) and ``temporal_resolution``
    to daily bins.

    Parameters
    ----------
    data_ds : xr.Dataset
        Full VOD dataset.
    grid : GridData
        Grid definition.
    **kwargs
        Forwarded to :func:`compute_percell_timeseries`.

    Returns
    -------
    xr.Dataset
        Per-cell time-series dataset.

    """
    # setdefault keeps the zenith defaults while allowing callers to
    # override either; passing these keys in **kwargs previously raised
    # TypeError ("got multiple values for keyword argument").
    kwargs.setdefault("theta_range", (0, 30))
    kwargs.setdefault("temporal_resolution", "1D")
    return compute_percell_timeseries(data_ds=data_ds, grid=grid, **kwargs)

Analysis

Filtering

Global (dataset-wide) outlier filters for gridded VOD data.

Classes

Filter – abstract base; compute_mask / apply contract. ZScoreFilter – mean ± k·σ rejection. IQRFilter – Q1 – f·IQR / Q3 + f·IQR rejection. RangeFilter – hard min/max bounds. PercentileFilter – lower/upper percentile bounds. CustomFilter – user-supplied callable mask. FilterPipeline – sequential or combined multi-filter application.

Convenience functions

create_zscore_filter – one-liner z-score filter. create_range_filter – one-liner range filter.

Notes

  • Filters never modify original data. apply returns a new xr.Dataset with <var>_filtered_<n> and mask_<n> variables appended.
  • Both numpy and dask-backed arrays are supported; dask paths compute only the scalar statistics eagerly while the mask itself stays lazy.

Filter

Bases: ABC

Base class for all filters. Filters NEVER modify original data.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
class Filter(ABC):
    """Base class for all filters.  Filters NEVER modify original data."""

    def __init__(self, name: str) -> None:
        """Initialize the filter.

        Parameters
        ----------
        name : str
            Filter name.

        """
        # Short identifier; also the default output suffix in apply().
        self.name = name
        # Provenance dict copied into the attrs of every filtered variable.
        self.metadata: dict = {
            "filter_type": self.__class__.__name__,
            "timestamp": datetime.now().isoformat(),
        }

    @abstractmethod
    def compute_mask(
        self,
        data: xr.DataArray,
        **kwargs: Any,
    ) -> xr.DataArray:
        """Compute boolean mask (True = keep, False = remove)."""
        ...

    def apply(
        self,
        ds: xr.Dataset,
        var_name: str,
        output_suffix: str | None = None,
        **kwargs: Any,
    ) -> xr.Dataset:
        """Apply filter to *ds*, returning a copy with filtered variable added.

        New variables
        -------------
        ``<var_name>_filtered_<suffix>`` : filtered data (NaN where masked).
        ``mask_<suffix>``               : boolean keep-mask.
        """
        suffix = output_suffix or self.name

        data = ds[var_name]
        mask = self.compute_mask(data, **kwargs)
        # .where() leaves NaN wherever mask is False; input data untouched.
        filtered_data = data.where(mask)

        # NOTE(review): .values forces eager evaluation of the removed-count
        # even for dask-backed masks (the mask itself stays lazy) — confirm
        # this is intentional for large arrays.
        n_total = int(mask.size)
        n_removed = int((~mask).sum().values)

        # NOTE(review): fraction_removed raises ZeroDivisionError for an
        # empty variable (mask.size == 0) — confirm callers never hit this.
        metadata = {
            **self.metadata,
            **kwargs,
            "applied_to": var_name,
            "n_total": n_total,
            "n_removed": n_removed,
            "fraction_removed": float(n_removed / n_total),
            "filter_chain": [self.name],
        }

        # Shallow copy: original dataset is never mutated.
        ds_out = ds.copy()

        filtered_var_name = f"{var_name}_filtered_{suffix}"
        mask_var_name = f"mask_{suffix}"

        ds_out[filtered_var_name] = filtered_data
        ds_out[filtered_var_name].attrs = metadata

        ds_out[mask_var_name] = mask
        ds_out[mask_var_name].attrs = {
            "description": f"Boolean mask for {self.name} filter",
            "True": "keep",
            "False": "filtered out",
            **metadata,
        }

        return ds_out

__init__(name)

Initialize the filter.

Parameters

name : str Filter name.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def __init__(self, name: str) -> None:
    """Initialize the filter.

    Parameters
    ----------
    name : str
        Filter name.

    """
    # Short identifier; also the default output suffix in apply().
    self.name = name
    # Provenance dict copied into the attrs of every filtered variable.
    self.metadata: dict = {
        "filter_type": self.__class__.__name__,
        "timestamp": datetime.now().isoformat(),
    }

compute_mask(data, **kwargs) abstractmethod

Compute boolean mask (True = keep, False = remove).

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
62
63
64
65
66
67
68
69
@abstractmethod
def compute_mask(
    self,
    data: xr.DataArray,
    **kwargs: Any,
) -> xr.DataArray:
    """Compute boolean mask (True = keep, False = remove).

    Subclasses return a boolean array that ``apply`` passes to
    ``data.where(mask)``.
    """
    ...

apply(ds, var_name, output_suffix=None, **kwargs)

Apply filter to ds, returning a copy with filtered variable added.

New variables

<var_name>_filtered_<suffix> : filtered data (NaN where masked). mask_<suffix> : boolean keep-mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def apply(
    self,
    ds: xr.Dataset,
    var_name: str,
    output_suffix: str | None = None,
    **kwargs: Any,
) -> xr.Dataset:
    """Apply filter to *ds*, returning a copy with filtered variable added.

    Parameters
    ----------
    ds : xr.Dataset
        Input dataset (never modified).
    var_name : str
        Name of the variable to filter.
    output_suffix : str, optional
        Suffix for the new variable names; defaults to ``self.name``.
    **kwargs
        Forwarded to :meth:`compute_mask`.

    New variables
    -------------
    ``<var_name>_filtered_<suffix>`` : filtered data (NaN where masked).
    ``mask_<suffix>``               : boolean keep-mask.
    """
    suffix = output_suffix or self.name

    data = ds[var_name]
    mask = self.compute_mask(data, **kwargs)
    filtered_data = data.where(mask)

    n_total = int(mask.size)
    n_removed = int((~mask).sum().values)

    metadata = {
        **self.metadata,
        **kwargs,
        "applied_to": var_name,
        "n_total": n_total,
        "n_removed": n_removed,
        # Guard against zero-size variables: avoid ZeroDivisionError and
        # report 0.0 removed for empty data.
        "fraction_removed": float(n_removed / n_total) if n_total else 0.0,
        "filter_chain": [self.name],
    }

    ds_out = ds.copy()

    filtered_var_name = f"{var_name}_filtered_{suffix}"
    mask_var_name = f"mask_{suffix}"

    ds_out[filtered_var_name] = filtered_data
    ds_out[filtered_var_name].attrs = metadata

    ds_out[mask_var_name] = mask
    ds_out[mask_var_name].attrs = {
        "description": f"Boolean mask for {self.name} filter",
        "True": "keep",
        "False": "filtered out",
        **metadata,
    }

    return ds_out

ZScoreFilter

Bases: Filter

Remove statistical outliers using z-score method.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
class ZScoreFilter(Filter):
    """Remove statistical outliers using z-score method."""

    def __init__(self) -> None:
        """Initialize the filter."""
        super().__init__("zscore")

    def compute_mask(self, data: xr.DataArray, threshold: float = 3.0) -> xr.DataArray:
        """Compute z-score mask.

        Parameters
        ----------
        data : xr.DataArray
            Input data.
        threshold : float
            Z-score threshold (default: 3.0).

        Returns
        -------
        xr.DataArray
            Boolean mask (True = keep).

        """
        if isinstance(data.data, da.Array):
            mean = da.nanmean(data.data).compute()
            std = da.nanstd(data.data).compute()
            if std == 0:
                # Constant data has no outliers; without this guard the
                # division below yields NaN/inf z-scores and rejects every
                # value. Keep all non-NaN values instead.
                return data.notnull()
            z_scores = da.fabs((data.data - mean) / std)
            mask_data = z_scores <= threshold
            mask = xr.DataArray(mask_data, dims=data.dims, coords=data.coords)
        else:
            mean = data.mean(skipna=True)
            std = data.std(skipna=True)
            if float(std) == 0:
                # Same zero-variance guard for the eager (NumPy-backed) path.
                return data.notnull()
            z_scores = np.abs((data - mean) / std)
            mask = z_scores <= threshold

        return mask

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
131
132
133
def __init__(self) -> None:
    """Register this filter under the name ``"zscore"``."""
    super().__init__("zscore")

compute_mask(data, threshold=3.0)

Compute z-score mask.

Parameters

data : xr.DataArray Input data. threshold : float Z-score threshold (default: 3.0).

Returns

xr.DataArray Boolean mask (True = keep).

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def compute_mask(self, data: xr.DataArray, threshold: float = 3.0) -> xr.DataArray:
    """Compute z-score mask.

    Parameters
    ----------
    data : xr.DataArray
        Input data.
    threshold : float
        Z-score threshold (default: 3.0).

    Returns
    -------
    xr.DataArray
        Boolean mask (True = keep).

    """
    if isinstance(data.data, da.Array):
        mean = da.nanmean(data.data).compute()
        std = da.nanstd(data.data).compute()
        if std == 0:
            # Constant data has no outliers; without this guard the division
            # below yields NaN/inf z-scores and rejects every value.
            return data.notnull()
        z_scores = da.fabs((data.data - mean) / std)
        mask_data = z_scores <= threshold
        mask = xr.DataArray(mask_data, dims=data.dims, coords=data.coords)
    else:
        mean = data.mean(skipna=True)
        std = data.std(skipna=True)
        if float(std) == 0:
            # Same zero-variance guard for the eager (NumPy-backed) path.
            return data.notnull()
        z_scores = np.abs((data - mean) / std)
        mask = z_scores <= threshold

    return mask

IQRFilter

Bases: Filter

Remove outliers using Interquartile Range method.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
class IQRFilter(Filter):
    """Remove outliers using Interquartile Range method."""

    def __init__(self) -> None:
        """Initialize the filter."""
        super().__init__("iqr")

    def compute_mask(self, data: xr.DataArray, factor: float = 1.5) -> xr.DataArray:
        """Compute IQR mask.

        Parameters
        ----------
        data : xr.DataArray
            Input data.
        factor : float
            IQR factor (default: 1.5).

        Returns
        -------
        xr.DataArray
            Boolean mask (True = keep).

        """
        if isinstance(data.data, da.Array):
            flat_data = data.data.ravel()
            # da.percentile propagates NaNs (unlike the eager branch, which
            # uses skipna=True), so drop them first for float data; otherwise
            # a single NaN gives NaN bounds and masks out everything.
            if np.issubdtype(flat_data.dtype, np.floating):
                flat_data = flat_data[~da.isnan(flat_data)]
            q1_val = da.percentile(flat_data, 25, method="linear").compute()
            q3_val = da.percentile(flat_data, 75, method="linear").compute()

            iqr = q3_val - q1_val
            lower_bound = q1_val - factor * iqr
            upper_bound = q3_val + factor * iqr

            mask_data = (data.data >= lower_bound) & (data.data <= upper_bound)
            mask = xr.DataArray(mask_data, dims=data.dims, coords=data.coords)
        else:
            q1 = data.quantile(0.25, skipna=True)
            q3 = data.quantile(0.75, skipna=True)
            iqr = q3 - q1

            lower_bound = q1 - factor * iqr
            upper_bound = q3 + factor * iqr

            mask = (data >= lower_bound) & (data <= upper_bound)

        return mask

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
169
170
171
def __init__(self) -> None:
    """Register this filter under the name ``"iqr"``."""
    super().__init__("iqr")

compute_mask(data, factor=1.5)

Compute IQR mask.

Parameters

data : xr.DataArray Input data. factor : float IQR factor (default: 1.5).

Returns

xr.DataArray Boolean mask (True = keep).

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def compute_mask(self, data: xr.DataArray, factor: float = 1.5) -> xr.DataArray:
    """Compute IQR mask.

    Parameters
    ----------
    data : xr.DataArray
        Input data.
    factor : float
        IQR factor (default: 1.5).

    Returns
    -------
    xr.DataArray
        Boolean mask (True = keep).

    """
    if isinstance(data.data, da.Array):
        flat_data = data.data.ravel()
        # da.percentile propagates NaNs (unlike the eager branch, which uses
        # skipna=True), so drop them first for float data; otherwise a single
        # NaN gives NaN bounds and masks out everything.
        if np.issubdtype(flat_data.dtype, np.floating):
            flat_data = flat_data[~da.isnan(flat_data)]
        q1_val = da.percentile(flat_data, 25, method="linear").compute()
        q3_val = da.percentile(flat_data, 75, method="linear").compute()

        iqr = q3_val - q1_val
        lower_bound = q1_val - factor * iqr
        upper_bound = q3_val + factor * iqr

        mask_data = (data.data >= lower_bound) & (data.data <= upper_bound)
        mask = xr.DataArray(mask_data, dims=data.dims, coords=data.coords)
    else:
        q1 = data.quantile(0.25, skipna=True)
        q3 = data.quantile(0.75, skipna=True)
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        mask = (data >= lower_bound) & (data <= upper_bound)

    return mask

RangeFilter

Bases: Filter

Filter values outside specified range.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
class RangeFilter(Filter):
    """Filter values outside specified range."""

    def __init__(self) -> None:
        """Register this filter under the name ``"range"``."""
        super().__init__("range")

    def compute_mask(
        self,
        data: xr.DataArray,
        min_value: float | None = None,
        max_value: float | None = None,
    ) -> xr.DataArray:
        """Build a keep-mask from optional lower/upper bounds.

        Parameters
        ----------
        data : xr.DataArray
            Input data.
        min_value : float, optional
            Minimum allowed value; no lower bound when omitted.
        max_value : float, optional
            Maximum allowed value; no upper bound when omitted.

        Returns
        -------
        xr.DataArray
            Boolean mask (True = keep).

        """
        # Start all-True so that omitted bounds leave values untouched.
        keep = xr.ones_like(data, dtype=bool)

        if min_value is not None:
            keep = keep & (data >= min_value)
        if max_value is not None:
            keep = keep & (data <= max_value)

        return keep

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
216
217
218
def __init__(self) -> None:
    """Register this filter under the name ``"range"``."""
    super().__init__("range")

compute_mask(data, min_value=None, max_value=None)

Compute range mask.

Parameters

data : xr.DataArray Input data. min_value : float, optional Minimum allowed value. max_value : float, optional Maximum allowed value.

Returns

xr.DataArray Boolean mask (True = keep).

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def compute_mask(
    self,
    data: xr.DataArray,
    min_value: float | None = None,
    max_value: float | None = None,
) -> xr.DataArray:
    """Build a keep-mask from optional lower/upper bounds.

    Parameters
    ----------
    data : xr.DataArray
        Input data.
    min_value : float, optional
        Minimum allowed value; no lower bound when omitted.
    max_value : float, optional
        Maximum allowed value; no upper bound when omitted.

    Returns
    -------
    xr.DataArray
        Boolean mask (True = keep).

    """
    # Start all-True so that omitted bounds leave values untouched.
    keep = xr.ones_like(data, dtype=bool)

    if min_value is not None:
        keep = keep & (data >= min_value)
    if max_value is not None:
        keep = keep & (data <= max_value)

    return keep

PercentileFilter

Bases: Filter

Filter values outside percentile range.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
class PercentileFilter(Filter):
    """Filter values outside percentile range."""

    def __init__(self) -> None:
        """Initialize the filter."""
        super().__init__("percentile")

    def compute_mask(
        self, data: xr.DataArray, lower: float = 5.0, upper: float = 95.0
    ) -> xr.DataArray:
        """Compute percentile mask.

        Parameters
        ----------
        data : xr.DataArray
            Input data.
        lower : float
            Lower percentile (0–100).
        upper : float
            Upper percentile (0–100).

        Returns
        -------
        xr.DataArray
            Boolean mask (True = keep).

        """
        if isinstance(data.data, da.Array):
            # Dask-aware branch, consistent with ZScoreFilter/IQRFilter:
            # DataArray.quantile cannot reduce over chunked dimensions, so
            # compute percentiles with dask directly. Drop NaNs first for
            # float data (da.percentile does not skip them).
            flat_data = data.data.ravel()
            if np.issubdtype(flat_data.dtype, np.floating):
                flat_data = flat_data[~da.isnan(flat_data)]
            lower_val = da.percentile(flat_data, lower, method="linear").compute()
            upper_val = da.percentile(flat_data, upper, method="linear").compute()
        else:
            lower_val = data.quantile(lower / 100.0, skipna=True)
            upper_val = data.quantile(upper / 100.0, skipna=True)

        mask = (data >= lower_val) & (data <= upper_val)

        return mask

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
256
257
258
def __init__(self) -> None:
    """Register this filter under the name ``"percentile"``."""
    super().__init__("percentile")

compute_mask(data, lower=5.0, upper=95.0)

Compute percentile mask.

Parameters

data : xr.DataArray Input data. lower : float Lower percentile (0–100). upper : float Upper percentile (0–100).

Returns

xr.DataArray Boolean mask (True = keep).

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
def compute_mask(
    self, data: xr.DataArray, lower: float = 5.0, upper: float = 95.0
) -> xr.DataArray:
    """Compute percentile mask.

    Parameters
    ----------
    data : xr.DataArray
        Input data.
    lower : float
        Lower percentile (0–100).
    upper : float
        Upper percentile (0–100).

    Returns
    -------
    xr.DataArray
        Boolean mask (True = keep).

    """
    if isinstance(data.data, da.Array):
        # Dask-aware branch, consistent with ZScoreFilter/IQRFilter:
        # DataArray.quantile cannot reduce over chunked dimensions, so
        # compute percentiles with dask directly. Drop NaNs first for
        # float data (da.percentile does not skip them).
        flat_data = data.data.ravel()
        if np.issubdtype(flat_data.dtype, np.floating):
            flat_data = flat_data[~da.isnan(flat_data)]
        lower_val = da.percentile(flat_data, lower, method="linear").compute()
        upper_val = da.percentile(flat_data, upper, method="linear").compute()
    else:
        lower_val = data.quantile(lower / 100.0, skipna=True)
        upper_val = data.quantile(upper / 100.0, skipna=True)

    mask = (data >= lower_val) & (data <= upper_val)

    return mask

CustomFilter

Bases: Filter

Apply a user-supplied callable as filter.

Parameters

name : str Filter identifier. func : callable (xr.DataArray, **kwargs) -> xr.DataArray returning a boolean mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
class CustomFilter(Filter):
    """Wrap a user-supplied callable as a filter.

    Parameters
    ----------
    name : str
        Filter identifier.
    func : callable
        ``(xr.DataArray, **kwargs) -> xr.DataArray`` returning a boolean mask.

    """

    def __init__(self, name: str, func: Callable[..., xr.DataArray]) -> None:
        """Store the callable and register the filter under *name*.

        Parameters
        ----------
        name : str
            Filter identifier.
        func : Callable[..., xr.DataArray]
            Callable returning a boolean mask.

        """
        super().__init__(name)
        self.func = func

    def compute_mask(self, data: xr.DataArray, **kwargs: Any) -> xr.DataArray:
        """Delegate mask computation to the user-supplied callable."""
        return self.func(data, **kwargs)

__init__(name, func)

Initialize the custom filter.

Parameters

name : str Filter identifier. func : Callable[..., xr.DataArray] Callable returning a boolean mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
300
301
302
303
304
305
306
307
308
309
310
311
312
def __init__(self, name: str, func: Callable[..., xr.DataArray]) -> None:
    """Store the callable and register the filter under *name*.

    Parameters
    ----------
    name : str
        Filter identifier.
    func : Callable[..., xr.DataArray]
        Callable returning a boolean mask.

    """
    super().__init__(name)
    self.func = func

compute_mask(data, **kwargs)

Apply custom function.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
314
315
316
317
318
319
320
def compute_mask(self, data: xr.DataArray, **kwargs: Any) -> xr.DataArray:
    """Delegate mask computation to the user-supplied callable."""
    return self.func(data, **kwargs)

FilterPipeline

Manage multiple filters applied sequentially or combined.

Non-destructive: creates new DataArrays, never modifies originals.

Parameters

ds : xr.Dataset Input dataset. var_name : str Variable to filter (default: 'VOD').

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
class FilterPipeline:
    """Manage multiple filters applied sequentially or combined.

    Non-destructive: creates new DataArrays, never modifies originals.

    Parameters
    ----------
    ds : xr.Dataset
        Input dataset.
    var_name : str
        Variable to filter (default: ``'VOD'``).

    """

    def __init__(self, ds: xr.Dataset, var_name: str = "VOD") -> None:
        """Initialize the filter pipeline.

        Parameters
        ----------
        ds : xr.Dataset
            Input dataset.
        var_name : str, default "VOD"
            Variable to filter.

        """
        self.ds = ds
        self.var_name = var_name
        # Each entry is (filter instance, kwargs forwarded to compute_mask).
        self.filters: list[tuple[Filter, dict]] = []

    def add_filter(self, filter_obj: Filter | str, **kwargs: Any) -> FilterPipeline:
        """Add filter to pipeline.

        Parameters
        ----------
        filter_obj : Filter or str
            Filter instance or short name
            (``'zscore'``, ``'iqr'``, ``'range'``, ``'percentile'``).
        **kwargs
            Parameters forwarded to ``compute_mask``.

        Returns
        -------
        FilterPipeline
            Self (for chaining).

        """
        if isinstance(filter_obj, str):
            _filter_map = {
                "zscore": ZScoreFilter,
                "iqr": IQRFilter,
                "range": RangeFilter,
                "percentile": PercentileFilter,
            }
            if filter_obj not in _filter_map:
                raise ValueError(f"Unknown filter: {filter_obj}")
            filter_obj = _filter_map[filter_obj]()

        self.filters.append((filter_obj, kwargs))
        return self

    def apply(
        self, mode: str = "sequential", output_name: str | None = None
    ) -> xr.Dataset:
        """Apply all filters in the pipeline.

        Parameters
        ----------
        mode : {'sequential', 'combined'}
            ``'sequential'`` – masks accumulate (AND) after each filter;
            intermediate filtered variables are written.
            ``'combined'``   – all masks computed independently on the
            original data, then AND-ed once.
        output_name : str, optional
            Alias for the final filtered variable.

        Returns
        -------
        xr.Dataset
            Dataset with filtered variables appended.

        """
        if not self.filters:
            raise ValueError("No filters in pipeline")

        ds_out = self.ds.copy()

        if mode == "sequential":
            masks: list[xr.DataArray] = []
            filter_names: list[str] = []
            all_params: list[dict] = []

            for filter_obj, kwargs in self.filters:
                mask = filter_obj.compute_mask(ds_out[self.var_name], **kwargs)
                masks.append(mask)
                filter_names.append(filter_obj.name)
                all_params.append(kwargs)

                # Store individual mask
                mask_name = f"mask_{filter_obj.name}"
                if mask_name not in ds_out:
                    ds_out[mask_name] = mask
                    ds_out[mask_name].attrs = {
                        "filter_type": filter_obj.name,
                        **kwargs,
                    }

                # Accumulate masks (AND); intermediate cumulative variables
                # are written at every step (documented behavior).
                cumulative_mask = masks[0]
                for m in masks[1:]:
                    cumulative_mask = cumulative_mask & m

                cumulative_suffix = "_".join(filter_names)
                filtered_data = ds_out[self.var_name].where(cumulative_mask)

                filtered_var_name = f"{self.var_name}_filtered_{cumulative_suffix}"
                cumulative_mask_name = f"mask_{cumulative_suffix}"

                ds_out[filtered_var_name] = filtered_data
                ds_out[cumulative_mask_name] = cumulative_mask

                n_total = int(cumulative_mask.size)
                n_removed = int((~cumulative_mask).sum().values)

                metadata: dict = {
                    "filter_chain": filter_names.copy(),
                    "mode": "sequential",
                    "applied_to": self.var_name,
                    "n_total": n_total,
                    "n_removed": n_removed,
                    # Guard against zero-size data (ZeroDivisionError).
                    "fraction_removed": float(n_removed / n_total)
                    if n_total
                    else 0.0,
                    "timestamp": datetime.now().isoformat(),
                    "filters": {
                        fname: params for fname, params in zip(filter_names, all_params)
                    },
                }

                ds_out[filtered_var_name].attrs = metadata
                ds_out[cumulative_mask_name].attrs = metadata

            if output_name:
                final_var = f"{self.var_name}_filtered_{'_'.join(filter_names)}"
                final_mask = f"mask_{'_'.join(filter_names)}"

                ds_out[f"{self.var_name}_filtered_{output_name}"] = ds_out[final_var]
                ds_out[f"mask_{output_name}"] = ds_out[final_mask]
                ds_out[f"{self.var_name}_filtered_{output_name}"].attrs = ds_out[
                    final_var
                ].attrs
                ds_out[f"mask_{output_name}"].attrs = ds_out[final_mask].attrs

        elif mode == "combined":
            masks = []
            filter_names = []
            all_params = []

            for filter_obj, kwargs in self.filters:
                mask = filter_obj.compute_mask(ds_out[self.var_name], **kwargs)
                masks.append(mask)
                filter_names.append(filter_obj.name)
                all_params.append(kwargs)

                mask_name = f"mask_{filter_obj.name}"
                if mask_name not in ds_out:
                    ds_out[mask_name] = mask
                    ds_out[mask_name].attrs = {
                        "filter_type": filter_obj.name,
                        **kwargs,
                    }

            combined_mask = masks[0]
            for mask in masks[1:]:
                combined_mask = combined_mask & mask

            suffix = output_name or "combined"
            filtered_data = ds_out[self.var_name].where(combined_mask)

            filtered_var_name = f"{self.var_name}_filtered_{suffix}"
            mask_var_name = f"mask_{suffix}"

            ds_out[filtered_var_name] = filtered_data
            ds_out[mask_var_name] = combined_mask

            n_total = int(combined_mask.size)
            n_removed = int((~combined_mask).sum().values)

            metadata = {
                "filter_chain": filter_names,
                "mode": "combined",
                "applied_to": self.var_name,
                "n_total": n_total,
                "n_removed": n_removed,
                # Guard against zero-size data (ZeroDivisionError).
                "fraction_removed": float(n_removed / n_total) if n_total else 0.0,
                "timestamp": datetime.now().isoformat(),
                "filters": {
                    fname: params for fname, params in zip(filter_names, all_params)
                },
            }

            ds_out[filtered_var_name].attrs = metadata
            ds_out[mask_var_name].attrs = metadata

        else:
            raise ValueError(f"Unknown mode: {mode}")

        return ds_out

    def summary(self) -> str:
        """Return a human-readable summary of the pipeline."""
        lines = [f"Filter Pipeline for '{self.var_name}':", ""]
        for i, (filter_obj, kwargs) in enumerate(self.filters):
            lines.append(f"{i + 1}. {filter_obj.name}")
            for key, val in kwargs.items():
                lines.append(f"   - {key}: {val}")
        return "\n".join(lines)

__init__(ds, var_name='VOD')

Initialize the filter pipeline.

Parameters

ds : xr.Dataset Input dataset. var_name : str, default "VOD" Variable to filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
342
343
344
345
346
347
348
349
350
351
352
353
354
355
def __init__(self, ds: xr.Dataset, var_name: str = "VOD") -> None:
    """Set up an empty pipeline over *ds*.

    Parameters
    ----------
    ds : xr.Dataset
        Input dataset.
    var_name : str, default "VOD"
        Variable to filter.

    """
    self.ds = ds
    self.var_name = var_name
    # Each entry is (filter instance, kwargs forwarded to compute_mask).
    self.filters: list[tuple[Filter, dict]] = []

add_filter(filter_obj, **kwargs)

Add filter to pipeline.

Parameters

filter_obj : Filter or str Filter instance or short name ('zscore', 'iqr', 'range', 'percentile'). **kwargs Parameters forwarded to compute_mask.

Returns

FilterPipeline Self (for chaining).

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
def add_filter(self, filter_obj: Filter | str, **kwargs: Any) -> FilterPipeline:
    """Append a filter to the pipeline.

    Parameters
    ----------
    filter_obj : Filter or str
        Filter instance or short name
        (``'zscore'``, ``'iqr'``, ``'range'``, ``'percentile'``).
    **kwargs
        Parameters forwarded to ``compute_mask``.

    Returns
    -------
    FilterPipeline
        Self (for chaining).

    """
    if isinstance(filter_obj, str):
        registry = {
            "zscore": ZScoreFilter,
            "iqr": IQRFilter,
            "range": RangeFilter,
            "percentile": PercentileFilter,
        }
        filter_cls = registry.get(filter_obj)
        if filter_cls is None:
            raise ValueError(f"Unknown filter: {filter_obj}")
        filter_obj = filter_cls()

    self.filters.append((filter_obj, kwargs))
    return self

apply(mode='sequential', output_name=None)

Apply all filters in the pipeline.

Parameters

mode : {'sequential', 'combined'} 'sequential' – masks accumulate (AND) after each filter; intermediate filtered variables are written. 'combined' – all masks computed independently on the original data, then AND-ed once. output_name : str, optional Alias for the final filtered variable.

Returns

xr.Dataset Dataset with filtered variables appended.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
def apply(
    self, mode: str = "sequential", output_name: str | None = None
) -> xr.Dataset:
    """Apply all filters in the pipeline.

    Parameters
    ----------
    mode : {'sequential', 'combined'}
        ``'sequential'`` – masks accumulate (AND) after each filter;
        intermediate filtered variables are written.
        ``'combined'``   – all masks computed independently on the
        original data, then AND-ed once.
    output_name : str, optional
        Alias for the final filtered variable.

    Returns
    -------
    xr.Dataset
        Dataset with filtered variables appended.

    """
    if not self.filters:
        raise ValueError("No filters in pipeline")

    ds_out = self.ds.copy()

    if mode == "sequential":
        masks: list[xr.DataArray] = []
        filter_names: list[str] = []
        all_params: list[dict] = []

        for filter_obj, kwargs in self.filters:
            mask = filter_obj.compute_mask(ds_out[self.var_name], **kwargs)
            masks.append(mask)
            filter_names.append(filter_obj.name)
            all_params.append(kwargs)

            # Store individual mask
            mask_name = f"mask_{filter_obj.name}"
            if mask_name not in ds_out:
                ds_out[mask_name] = mask
                ds_out[mask_name].attrs = {
                    "filter_type": filter_obj.name,
                    **kwargs,
                }

            # Accumulate masks (AND); intermediate cumulative variables are
            # written at every step (documented behavior).
            cumulative_mask = masks[0]
            for m in masks[1:]:
                cumulative_mask = cumulative_mask & m

            cumulative_suffix = "_".join(filter_names)
            filtered_data = ds_out[self.var_name].where(cumulative_mask)

            filtered_var_name = f"{self.var_name}_filtered_{cumulative_suffix}"
            cumulative_mask_name = f"mask_{cumulative_suffix}"

            ds_out[filtered_var_name] = filtered_data
            ds_out[cumulative_mask_name] = cumulative_mask

            n_total = int(cumulative_mask.size)
            n_removed = int((~cumulative_mask).sum().values)

            metadata: dict = {
                "filter_chain": filter_names.copy(),
                "mode": "sequential",
                "applied_to": self.var_name,
                "n_total": n_total,
                "n_removed": n_removed,
                # Guard against zero-size data (ZeroDivisionError).
                "fraction_removed": float(n_removed / n_total) if n_total else 0.0,
                "timestamp": datetime.now().isoformat(),
                "filters": {
                    fname: params for fname, params in zip(filter_names, all_params)
                },
            }

            ds_out[filtered_var_name].attrs = metadata
            ds_out[cumulative_mask_name].attrs = metadata

        if output_name:
            final_var = f"{self.var_name}_filtered_{'_'.join(filter_names)}"
            final_mask = f"mask_{'_'.join(filter_names)}"

            ds_out[f"{self.var_name}_filtered_{output_name}"] = ds_out[final_var]
            ds_out[f"mask_{output_name}"] = ds_out[final_mask]
            ds_out[f"{self.var_name}_filtered_{output_name}"].attrs = ds_out[
                final_var
            ].attrs
            ds_out[f"mask_{output_name}"].attrs = ds_out[final_mask].attrs

    elif mode == "combined":
        masks = []
        filter_names = []
        all_params = []

        for filter_obj, kwargs in self.filters:
            mask = filter_obj.compute_mask(ds_out[self.var_name], **kwargs)
            masks.append(mask)
            filter_names.append(filter_obj.name)
            all_params.append(kwargs)

            mask_name = f"mask_{filter_obj.name}"
            if mask_name not in ds_out:
                ds_out[mask_name] = mask
                ds_out[mask_name].attrs = {
                    "filter_type": filter_obj.name,
                    **kwargs,
                }

        combined_mask = masks[0]
        for mask in masks[1:]:
            combined_mask = combined_mask & mask

        suffix = output_name or "combined"
        filtered_data = ds_out[self.var_name].where(combined_mask)

        filtered_var_name = f"{self.var_name}_filtered_{suffix}"
        mask_var_name = f"mask_{suffix}"

        ds_out[filtered_var_name] = filtered_data
        ds_out[mask_var_name] = combined_mask

        n_total = int(combined_mask.size)
        n_removed = int((~combined_mask).sum().values)

        metadata = {
            "filter_chain": filter_names,
            "mode": "combined",
            "applied_to": self.var_name,
            "n_total": n_total,
            "n_removed": n_removed,
            # Guard against zero-size data (ZeroDivisionError).
            "fraction_removed": float(n_removed / n_total) if n_total else 0.0,
            "timestamp": datetime.now().isoformat(),
            "filters": {
                fname: params for fname, params in zip(filter_names, all_params)
            },
        }

        ds_out[filtered_var_name].attrs = metadata
        ds_out[mask_var_name].attrs = metadata

    else:
        raise ValueError(f"Unknown mode: {mode}")

    return ds_out

summary()

Return a human-readable summary of the pipeline.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
534
535
536
537
538
539
540
541
def summary(self) -> str:
    """Build a multi-line, human-readable description of the filter chain."""
    out = [f"Filter Pipeline for '{self.var_name}':", ""]
    for step, (flt, params) in enumerate(self.filters, start=1):
        out.append(f"{step}. {flt.name}")
        out.extend(f"   - {key}: {val}" for key, val in params.items())
    return "\n".join(out)

create_zscore_filter(ds, var_name='VOD', threshold=3.0, suffix='zscore')

One-liner z-score filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
549
550
551
552
553
554
555
556
def create_zscore_filter(
    ds: xr.Dataset,
    var_name: str = "VOD",
    threshold: float = 3.0,
    suffix: str = "zscore",
) -> xr.Dataset:
    """Convenience wrapper: apply a global z-score filter in a single call."""
    zfilter = ZScoreFilter()
    return zfilter.apply(ds, var_name, suffix, threshold=threshold)

create_range_filter(ds, var_name='VOD', min_value=None, max_value=None, suffix='range')

One-liner range filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/filtering.py
559
560
561
562
563
564
565
566
567
568
569
def create_range_filter(
    ds: xr.Dataset,
    var_name: str = "VOD",
    min_value: float | None = None,
    max_value: float | None = None,
    suffix: str = "range",
) -> xr.Dataset:
    """Convenience wrapper: apply a global min/max range filter in one call."""
    rfilter = RangeFilter()
    return rfilter.apply(
        ds, var_name, suffix, min_value=min_value, max_value=max_value
    )

Per-Cell Filtering

Per-cell outlier filters for gridded VOD data.

Unlike the global filters in :mod:~canvod.grids.analysis.filtering, these operate independently on each grid cell, preserving spatial structure while removing temporal outliers within cells.

Classes

PerCellFilter – abstract base with auto cell-id detection. PerCellIQRFilter – per-cell IQR rejection. PerCellZScoreFilter – per-cell z-score rejection. PerCellRangeFilter – per-cell hard bounds. PerCellPercentileFilter – per-cell percentile bounds. PerCellFilterPipeline – sequential or combined multi-filter application.

Convenience functions

create_per_cell_iqr_filter – one-liner per-cell IQR. create_per_cell_zscore_filter – one-liner per-cell z-score.

PerCellFilter

Bases: ABC

Base class for per-cell filtering operations.

Sub-classes implement :meth:compute_cell_mask for a single cell's 1-D data array; the base class handles iteration over cells, auto-detection of the cell_id_* variable, and output assembly.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class PerCellFilter(ABC):
    """Base class for per-cell filtering operations.

    Sub-classes implement :meth:`compute_cell_mask` for a single cell's
    1-D data array; the base class handles iteration over cells,
    auto-detection of the ``cell_id_*`` variable, and output assembly.
    """

    def __init__(self, filter_name: str) -> None:
        """Initialize the per-cell filter.

        Parameters
        ----------
        filter_name : str
            Filter name.

        """
        # Doubles as the default output suffix in apply().
        self.filter_name = filter_name

    @abstractmethod
    def compute_cell_mask(
        self,
        cell_data: np.ndarray,
        **kwargs: Any,
    ) -> np.ndarray:
        """Return a boolean keep-mask for one cell's data.

        Parameters
        ----------
        cell_data : np.ndarray
            1-D array of values for one cell across time.

        Returns
        -------
        np.ndarray
            Boolean mask (True = keep).

        """
        ...

    def apply(
        self,
        ds: xr.Dataset,
        var_name: str = "VOD",
        cell_id_var: str | None = None,
        output_suffix: str | None = None,
        min_observations: int = 5,
        **kwargs: Any,
    ) -> xr.Dataset:
        """Apply per-cell filtering to *ds*.

        Parameters
        ----------
        ds : xr.Dataset
            Input dataset (must contain a ``cell_id_*`` variable).
        var_name : str
            Variable to filter.
        cell_id_var : str, optional
            Cell-ID variable name.  Auto-detected from ``cell_id_*`` if *None*.
        output_suffix : str, optional
            Suffix for output variables (default: ``filter_name``).
        min_observations : int
            Minimum observations per cell required for filtering.
        **kwargs
            Forwarded to :meth:`compute_cell_mask`.

        Returns
        -------
        xr.Dataset
            Copy of *ds* with ``<var>_filtered_<suffix>`` and
            ``mask_<suffix>`` appended.

        """
        if output_suffix is None:
            output_suffix = self.filter_name

        # Auto-detect cell_id variable
        if cell_id_var is None:
            cell_id_vars = [v for v in ds.data_vars if v.startswith("cell_id_")]
            if not cell_id_vars:
                raise ValueError("No cell_id variable found in dataset")
            # NOTE(review): if several cell_id_* variables exist, the first
            # one in data_vars order wins silently.
            cell_id_var = cell_id_vars[0]
            logger.info("Auto-detected cell_id variable: %s", cell_id_var)

        # Fail fast before doing any per-cell work.
        if var_name not in ds:
            raise ValueError(f"Variable '{var_name}' not found in dataset")
        if cell_id_var not in ds:
            raise ValueError(f"Cell ID variable '{cell_id_var}' not found in dataset")

        logger.info("Applying %s filter per-cell to '%s'", self.filter_name, var_name)

        var_data = ds[var_name]
        cell_ids = ds[cell_id_var]

        filtered_data, mask = self._apply_per_cell_filtering(
            var_data, cell_ids, min_observations, **kwargs
        )

        # Work on a copy so the caller's dataset is left untouched.
        result = ds.copy()
        result[f"{var_name}_filtered_{output_suffix}"] = filtered_data
        result[f"mask_{output_suffix}"] = mask

        result[f"{var_name}_filtered_{output_suffix}"].attrs.update(
            {
                "filter_type": self.filter_name,
                "filter_params": str(kwargs),
                "min_observations": min_observations,
                "source_variable": var_name,
            }
        )

        return result

    def _apply_per_cell_filtering(
        self,
        var_data: xr.DataArray,
        cell_ids: xr.DataArray,
        min_observations: int,
        **kwargs: Any,
    ) -> tuple[xr.DataArray, xr.DataArray]:
        """Iterate over unique cells and apply the mask function.

        Parameters
        ----------
        var_data : xr.DataArray
            Data to filter.
        cell_ids : xr.DataArray
            Cell ID assignments.
        min_observations : int
            Minimum observations per cell.
        **kwargs : Any
            Forwarded to ``compute_cell_mask``.

        Returns
        -------
        Tuple[xr.DataArray, xr.DataArray]
            Filtered values and mask arrays.

        """
        values = var_data.values
        cells = cell_ids.values

        # Start from "keep everything": cells skipped below (too few
        # observations) retain their original values and an all-True mask.
        filtered_values = values.copy()
        mask_values = np.ones_like(values, dtype=bool)

        # Non-finite cell ids mark unassigned samples; exclude them.
        unique_cells = np.unique(cells[np.isfinite(cells)])
        logger.info("Processing %d unique cells", len(unique_cells))

        cells_processed = 0
        cells_filtered = 0
        total_filtered = 0

        for cell_id in unique_cells:
            # Samples belonging to this cell with finite value and cell id.
            cell_mask = (cells == cell_id) & np.isfinite(values) & np.isfinite(cells)
            cell_indices = np.where(cell_mask)

            if len(cell_indices[0]) < min_observations:
                continue

            cell_data = values[cell_mask]
            cell_filter_mask = self.compute_cell_mask(cell_data, **kwargs)

            # Write the per-cell decision back into the full-size arrays;
            # rejected samples become NaN in the filtered output.
            mask_values[cell_mask] = cell_filter_mask
            filtered_values[cell_mask] = np.where(cell_filter_mask, cell_data, np.nan)

            cells_processed += 1
            if not np.all(cell_filter_mask):
                cells_filtered += 1
                total_filtered += int(np.sum(~cell_filter_mask))

        logger.info(
            "Processed %d cells, filtered %d cells, removed %d observations",
            cells_processed,
            cells_filtered,
            total_filtered,
        )

        # Preserve dims/coords/attrs so outputs stay aligned with the source.
        filtered_da = xr.DataArray(
            filtered_values,
            dims=var_data.dims,
            coords=var_data.coords,
            attrs=var_data.attrs,
        )
        mask_da = xr.DataArray(mask_values, dims=var_data.dims, coords=var_data.coords)

        return filtered_da, mask_da

__init__(filter_name)

Initialize the per-cell filter.

Parameters

filter_name : str Filter name.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
47
48
49
50
51
52
53
54
55
56
def __init__(self, filter_name: str) -> None:
    """Initialize the per-cell filter.

    Parameters
    ----------
    filter_name : str
        Filter name.

    """
    # Doubles as the default output-variable suffix in apply().
    self.filter_name = filter_name

compute_cell_mask(cell_data, **kwargs) abstractmethod

Return a boolean keep-mask for one cell's data.

Parameters

cell_data : np.ndarray 1-D array of values for one cell across time.

Returns

np.ndarray Boolean mask (True = keep).

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
@abstractmethod
def compute_cell_mask(
    self,
    cell_data: np.ndarray,
    **kwargs: Any,
) -> np.ndarray:
    """Compute the keep-mask (True = keep) for a single cell.

    Parameters
    ----------
    cell_data : np.ndarray
        One-dimensional values observed in a single cell over time.

    Returns
    -------
    np.ndarray
        Boolean array aligned with *cell_data*; True marks kept samples.

    """
    ...

apply(ds, var_name='VOD', cell_id_var=None, output_suffix=None, min_observations=5, **kwargs)

Apply per-cell filtering to ds.

Parameters

ds : xr.Dataset Input dataset (must contain a cell_id_* variable). var_name : str Variable to filter. cell_id_var : str, optional Cell-ID variable name. Auto-detected from cell_id_* if None. output_suffix : str, optional Suffix for output variables (default: filter_name). min_observations : int Minimum observations per cell required for filtering. **kwargs Forwarded to :meth:compute_cell_mask.

Returns

xr.Dataset Copy of ds with <var>_filtered_<suffix> and mask_<suffix> appended.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def apply(
    self,
    ds: xr.Dataset,
    var_name: str = "VOD",
    cell_id_var: str | None = None,
    output_suffix: str | None = None,
    min_observations: int = 5,
    **kwargs: Any,
) -> xr.Dataset:
    """Apply per-cell filtering to *ds*.

    Parameters
    ----------
    ds : xr.Dataset
        Input dataset (must contain a ``cell_id_*`` variable).
    var_name : str
        Variable to filter.
    cell_id_var : str, optional
        Cell-ID variable name.  Auto-detected from ``cell_id_*`` if *None*.
    output_suffix : str, optional
        Suffix for output variables (default: ``filter_name``).
    min_observations : int
        Minimum observations per cell required for filtering.
    **kwargs
        Forwarded to :meth:`compute_cell_mask`.

    Returns
    -------
    xr.Dataset
        Copy of *ds* with ``<var>_filtered_<suffix>`` and
        ``mask_<suffix>`` appended.

    """
    if output_suffix is None:
        output_suffix = self.filter_name

    # Auto-detect cell_id variable
    if cell_id_var is None:
        cell_id_vars = [v for v in ds.data_vars if v.startswith("cell_id_")]
        if not cell_id_vars:
            raise ValueError("No cell_id variable found in dataset")
        # NOTE(review): with several cell_id_* variables, the first one in
        # data_vars order wins silently.
        cell_id_var = cell_id_vars[0]
        logger.info("Auto-detected cell_id variable: %s", cell_id_var)

    # Fail fast before doing any per-cell work.
    if var_name not in ds:
        raise ValueError(f"Variable '{var_name}' not found in dataset")
    if cell_id_var not in ds:
        raise ValueError(f"Cell ID variable '{cell_id_var}' not found in dataset")

    logger.info("Applying %s filter per-cell to '%s'", self.filter_name, var_name)

    var_data = ds[var_name]
    cell_ids = ds[cell_id_var]

    filtered_data, mask = self._apply_per_cell_filtering(
        var_data, cell_ids, min_observations, **kwargs
    )

    # Work on a copy so the caller's dataset is left untouched.
    result = ds.copy()
    result[f"{var_name}_filtered_{output_suffix}"] = filtered_data
    result[f"mask_{output_suffix}"] = mask

    result[f"{var_name}_filtered_{output_suffix}"].attrs.update(
        {
            "filter_type": self.filter_name,
            "filter_params": str(kwargs),
            "min_observations": min_observations,
            "source_variable": var_name,
        }
    )

    return result

PerCellIQRFilter

Bases: PerCellFilter

Per-cell IQR outlier rejection.

Values outside [Q1 − factor·IQR, Q3 + factor·IQR] are removed independently within each cell.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
class PerCellIQRFilter(PerCellFilter):
    """Per-cell Tukey-fence (IQR) outlier rejection.

    Within each cell, samples falling outside
    ``[Q1 − factor·IQR, Q3 + factor·IQR]`` are dropped.
    """

    def __init__(self) -> None:
        """Register under the short name ``"iqr"``."""
        super().__init__("iqr")

    def compute_cell_mask(
        self, cell_data: np.ndarray, factor: float = 1.5
    ) -> np.ndarray:
        """Tukey-fence keep-mask for one cell.

        Parameters
        ----------
        cell_data : np.ndarray
            1-D cell data.
        factor : float
            Fence multiplier applied to the IQR (default 1.5).

        Returns
        -------
        np.ndarray
            Boolean keep-mask aligned with *cell_data*.

        """
        finite = cell_data[np.isfinite(cell_data)]
        keep_all = np.ones_like(cell_data, dtype=bool)

        # Too few finite samples for meaningful quartiles -> keep everything.
        if finite.size < 4:
            return keep_all

        q1, q3 = np.percentile(finite, [25, 75])
        spread = q3 - q1
        # Degenerate (constant) distribution -> nothing to reject.
        if spread == 0:
            return keep_all

        lo = q1 - factor * spread
        hi = q3 + factor * spread
        return (cell_data >= lo) & (cell_data <= hi)

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
239
240
241
def __init__(self) -> None:
    """Initialize the filter."""
    # Registers the short name "iqr", used as the default output suffix.
    super().__init__("iqr")

compute_cell_mask(cell_data, factor=1.5)

IQR mask for a single cell.

Parameters

cell_data : np.ndarray 1-D cell data. factor : float IQR multiplier (default 1.5).

Returns

np.ndarray Boolean mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
def compute_cell_mask(
    self, cell_data: np.ndarray, factor: float = 1.5
) -> np.ndarray:
    """Tukey-fence (IQR) keep-mask for one cell.

    Parameters
    ----------
    cell_data : np.ndarray
        1-D cell data.
    factor : float
        Fence multiplier applied to the IQR (default 1.5).

    Returns
    -------
    np.ndarray
        Boolean keep-mask aligned with *cell_data*.

    """
    finite = cell_data[np.isfinite(cell_data)]
    keep_all = np.ones_like(cell_data, dtype=bool)

    # Too few finite samples for meaningful quartiles -> keep everything.
    if finite.size < 4:
        return keep_all

    q1, q3 = np.percentile(finite, [25, 75])
    spread = q3 - q1
    # Degenerate (constant) distribution -> nothing to reject.
    if spread == 0:
        return keep_all

    lo = q1 - factor * spread
    hi = q3 + factor * spread
    return (cell_data >= lo) & (cell_data <= hi)

PerCellZScoreFilter

Bases: PerCellFilter

Per-cell z-score outlier rejection.

Values with |z| > threshold are removed independently within each cell.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
class PerCellZScoreFilter(PerCellFilter):
    """Per-cell z-score outlier rejection.

    Within each cell, samples whose standardized distance from the cell
    mean exceeds ``threshold`` are dropped.
    """

    def __init__(self) -> None:
        """Register under the short name ``"zscore"``."""
        super().__init__("zscore")

    def compute_cell_mask(
        self, cell_data: np.ndarray, threshold: float = 3.0
    ) -> np.ndarray:
        """Z-score keep-mask for one cell.

        Parameters
        ----------
        cell_data : np.ndarray
            1-D cell data.
        threshold : float
            Maximum allowed |z| (default 3.0).

        Returns
        -------
        np.ndarray
            Boolean keep-mask aligned with *cell_data*.

        """
        keep_all = np.ones_like(cell_data, dtype=bool)
        # Too few samples for a meaningful spread estimate.
        if len(cell_data) < 3:
            return keep_all

        center = np.nanmean(cell_data)
        spread = np.nanstd(cell_data)
        # Constant data: every z-score would be undefined/zero.
        if spread == 0:
            return keep_all

        return np.abs(cell_data - center) / spread <= threshold

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
284
285
286
def __init__(self) -> None:
    """Initialize the filter."""
    # Registers the short name "zscore", used as the default output suffix.
    super().__init__("zscore")

compute_cell_mask(cell_data, threshold=3.0)

Z-score mask for a single cell.

Parameters

cell_data : np.ndarray 1-D cell data. threshold : float Z-score threshold (default 3.0).

Returns

np.ndarray Boolean mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
def compute_cell_mask(
    self, cell_data: np.ndarray, threshold: float = 3.0
) -> np.ndarray:
    """Z-score keep-mask for one cell.

    Parameters
    ----------
    cell_data : np.ndarray
        1-D cell data.
    threshold : float
        Maximum allowed |z| (default 3.0).

    Returns
    -------
    np.ndarray
        Boolean keep-mask aligned with *cell_data*.

    """
    keep_all = np.ones_like(cell_data, dtype=bool)
    # Too few samples for a meaningful spread estimate.
    if len(cell_data) < 3:
        return keep_all

    center = np.nanmean(cell_data)
    spread = np.nanstd(cell_data)
    # Constant data: every z-score would be undefined/zero.
    if spread == 0:
        return keep_all

    return np.abs(cell_data - center) / spread <= threshold

PerCellRangeFilter

Bases: PerCellFilter

Per-cell hard-bound range filter.

Identical bounds applied to every cell.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
class PerCellRangeFilter(PerCellFilter):
    """Per-cell hard-bound range filter.

    The same fixed bounds are applied to every cell.
    """

    def __init__(self) -> None:
        """Register under the short name ``"range"``."""
        super().__init__("range")

    def compute_cell_mask(
        self,
        cell_data: np.ndarray,
        min_value: float | None = None,
        max_value: float | None = None,
    ) -> np.ndarray:
        """Range keep-mask for a single cell.

        Parameters
        ----------
        cell_data : np.ndarray
            1-D cell data.
        min_value : float, optional
            Minimum allowed value (no lower bound if *None*).
        max_value : float, optional
            Maximum allowed value (no upper bound if *None*).

        Returns
        -------
        np.ndarray
            Boolean keep-mask aligned with *cell_data*.

        """
        keep = np.full(cell_data.shape, True)
        if min_value is not None:
            keep &= cell_data >= min_value
        if max_value is not None:
            keep &= cell_data <= max_value
        return keep

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
325
326
327
def __init__(self) -> None:
    """Initialize the filter."""
    # Registers the short name "range", used as the default output suffix.
    super().__init__("range")

compute_cell_mask(cell_data, min_value=None, max_value=None)

Range mask for a single cell.

Parameters

cell_data : np.ndarray 1-D cell data. min_value : float, optional Minimum allowed value. max_value : float, optional Maximum allowed value.

Returns

np.ndarray Boolean mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
def compute_cell_mask(
    self,
    cell_data: np.ndarray,
    min_value: float | None = None,
    max_value: float | None = None,
) -> np.ndarray:
    """Range mask for a single cell.

    Parameters
    ----------
    cell_data : np.ndarray
        1-D cell data.
    min_value : float, optional
        Minimum allowed value.
    max_value : float, optional
        Maximum allowed value.

    Returns
    -------
    np.ndarray
        Boolean mask.

    """
    mask = np.ones_like(cell_data, dtype=bool)
    if min_value is not None:
        mask = mask & (cell_data >= min_value)
    if max_value is not None:
        mask = mask & (cell_data <= max_value)
    return mask

PerCellPercentileFilter

Bases: PerCellFilter

Per-cell percentile-bound filter.

Bounds are computed independently within each cell.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
class PerCellPercentileFilter(PerCellFilter):
    """Per-cell percentile-bound filter.

    Lower/upper bounds are computed from each cell's own distribution.
    """

    def __init__(self) -> None:
        """Register under the short name ``"percentile"``."""
        super().__init__("percentile")

    def compute_cell_mask(
        self, cell_data: np.ndarray, lower: float = 5.0, upper: float = 95.0
    ) -> np.ndarray:
        """Percentile keep-mask for a single cell.

        Parameters
        ----------
        cell_data : np.ndarray
            1-D cell data.
        lower : float
            Lower percentile bound (0–100).
        upper : float
            Upper percentile bound (0–100).

        Returns
        -------
        np.ndarray
            Boolean keep-mask aligned with *cell_data*.

        """
        finite = cell_data[np.isfinite(cell_data)]
        # Too few finite samples for stable percentiles -> keep everything.
        if finite.size < 5:
            return np.ones_like(cell_data, dtype=bool)

        lo, hi = np.percentile(finite, [lower, upper])
        return (cell_data >= lo) & (cell_data <= hi)

__init__()

Initialize the filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
366
367
368
def __init__(self) -> None:
    """Initialize the filter."""
    # Registers the short name "percentile", used as the default output suffix.
    super().__init__("percentile")

compute_cell_mask(cell_data, lower=5.0, upper=95.0)

Percentile mask for a single cell.

Parameters

cell_data : np.ndarray 1-D cell data. lower : float Lower percentile (0–100). upper : float Upper percentile (0–100).

Returns

np.ndarray Boolean mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
def compute_cell_mask(
    self, cell_data: np.ndarray, lower: float = 5.0, upper: float = 95.0
) -> np.ndarray:
    """Percentile keep-mask for a single cell.

    Parameters
    ----------
    cell_data : np.ndarray
        1-D cell data.
    lower : float
        Lower percentile bound (0–100).
    upper : float
        Upper percentile bound (0–100).

    Returns
    -------
    np.ndarray
        Boolean keep-mask aligned with *cell_data*.

    """
    finite = cell_data[np.isfinite(cell_data)]
    # Too few finite samples for stable percentiles -> keep everything.
    if finite.size < 5:
        return np.ones_like(cell_data, dtype=bool)

    lo, hi = np.percentile(finite, [lower, upper])
    return (cell_data >= lo) & (cell_data <= hi)

PerCellFilterPipeline

Sequential or combined multi-filter pipeline for per-cell filtering.

Parameters

ds : xr.Dataset Input dataset. var_name : str Variable to filter (default: 'VOD').

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
class PerCellFilterPipeline:
    """Sequential or combined multi-filter pipeline for per-cell filtering.

    Parameters
    ----------
    ds : xr.Dataset
        Input dataset.
    var_name : str
        Variable to filter (default: ``'VOD'``).

    """

    def __init__(self, ds: xr.Dataset, var_name: str = "VOD") -> None:
        """Initialize the filter pipeline.

        Parameters
        ----------
        ds : xr.Dataset
            Input dataset.
        var_name : str, default "VOD"
            Variable to filter.

        """
        self.ds = ds
        self.var_name = var_name
        # Ordered (filter, kwargs) steps; populated via add_filter().
        self.filters: list[tuple[PerCellFilter, dict]] = []

    def add_filter(
        self,
        filter_obj: PerCellFilter | str,
        **kwargs: Any,
    ) -> PerCellFilterPipeline:
        """Add a filter.

        Parameters
        ----------
        filter_obj : PerCellFilter or str
            Filter instance or short name
            (``'iqr'``, ``'zscore'``, ``'range'``, ``'percentile'``).
        **kwargs
            Parameters forwarded to the filter.

        Returns
        -------
        PerCellFilterPipeline
            Self (for chaining).

        """
        if isinstance(filter_obj, str):
            # Resolve short names to their filter classes.
            _filter_map = {
                "iqr": PerCellIQRFilter,
                "zscore": PerCellZScoreFilter,
                "range": PerCellRangeFilter,
                "percentile": PerCellPercentileFilter,
            }
            if filter_obj not in _filter_map:
                raise ValueError(f"Unknown filter: {filter_obj}")
            filter_obj = _filter_map[filter_obj]()

        self.filters.append((filter_obj, kwargs))
        return self

    def apply(
        self, mode: str = "sequential", output_name: str | None = None
    ) -> xr.Dataset:
        """Apply all filters.

        Parameters
        ----------
        mode : {'sequential', 'combined'}
            ``'sequential'`` – each filter operates on the previous output.
            ``'combined'``   – all masks computed on the original, then AND-ed.
        output_name : str, optional
            Alias for the final filtered variable.

        Returns
        -------
        xr.Dataset
            Dataset with filtered results.

        """
        if not self.filters:
            raise ValueError("No filters added to pipeline")

        result = self.ds.copy()
        current_var = self.var_name

        if mode == "sequential":
            for i, (filter_obj, kwargs) in enumerate(self.filters):
                # Disambiguate repeated filter types after the first step
                # by appending the step index to the suffix.
                suffix = (
                    f"{filter_obj.filter_name}_{i}" if i > 0 else filter_obj.filter_name
                )
                result = filter_obj.apply(
                    result, current_var, output_suffix=suffix, **kwargs
                )
                # Chain: next filter consumes this filter's output variable.
                current_var = f"{self.var_name}_filtered_{suffix}"

        elif mode == "combined":
            combined_mask = None
            filter_names: list[str] = []

            # Each mask is computed against the ORIGINAL variable, then
            # AND-ed; a sample survives only if every filter keeps it.
            for filter_obj, kwargs in self.filters:
                filtered = filter_obj.apply(result, self.var_name, **kwargs)
                mask = filtered[f"mask_{filter_obj.filter_name}"]

                if combined_mask is None:
                    combined_mask = mask
                else:
                    combined_mask = combined_mask & mask

                filter_names.append(filter_obj.filter_name)

            final_name = output_name or "_".join(filter_names)
            result[f"{self.var_name}_filtered_{final_name}"] = result[
                self.var_name
            ].where(combined_mask)
            result[f"mask_{final_name}"] = combined_mask

        else:
            raise ValueError(f"Unknown mode: {mode}")

        return result

__init__(ds, var_name='VOD')

Initialize the filter pipeline.

Parameters

ds : xr.Dataset Input dataset. var_name : str, default "VOD" Variable to filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
417
418
419
420
421
422
423
424
425
426
427
428
429
430
def __init__(self, ds: xr.Dataset, var_name: str = "VOD") -> None:
    """Initialize the filter pipeline.

    Parameters
    ----------
    ds : xr.Dataset
        Input dataset.
    var_name : str, default "VOD"
        Variable to filter.

    """
    self.ds = ds
    self.var_name = var_name
    # Ordered (filter, kwargs) steps; populated via add_filter().
    self.filters: list[tuple[PerCellFilter, dict]] = []

add_filter(filter_obj, **kwargs)

Add a filter.

Parameters

filter_obj : PerCellFilter or str Filter instance or short name ('iqr', 'zscore', 'range', 'percentile'). **kwargs Parameters forwarded to the filter.

Returns

PerCellFilterPipeline Self (for chaining).

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
def add_filter(
    self,
    filter_obj: PerCellFilter | str,
    **kwargs: Any,
) -> PerCellFilterPipeline:
    """Append a filter step to the pipeline.

    Parameters
    ----------
    filter_obj : PerCellFilter or str
        A ready-made filter instance, or one of the registered short
        names ``'iqr'``, ``'zscore'``, ``'range'``, ``'percentile'``.
    **kwargs
        Keyword parameters stored alongside the filter and forwarded
        to it at apply time.

    Returns
    -------
    PerCellFilterPipeline
        This pipeline, so calls can be chained.

    """
    if isinstance(filter_obj, str):
        # Resolve short names to their filter classes.
        registry = {
            "iqr": PerCellIQRFilter,
            "zscore": PerCellZScoreFilter,
            "range": PerCellRangeFilter,
            "percentile": PerCellPercentileFilter,
        }
        if filter_obj not in registry:
            raise ValueError(f"Unknown filter: {filter_obj}")
        filter_obj = registry[filter_obj]()

    self.filters.append((filter_obj, kwargs))
    return self

apply(mode='sequential', output_name=None)

Apply all filters.

Parameters

mode : {'sequential', 'combined'} 'sequential' – each filter operates on the previous output. 'combined' – all masks computed on the original, then AND-ed. output_name : str, optional Alias for the final filtered variable.

Returns

xr.Dataset Dataset with filtered results.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
def apply(
    self, mode: str = "sequential", output_name: str | None = None
) -> xr.Dataset:
    """Run every registered filter over the dataset.

    Parameters
    ----------
    mode : {'sequential', 'combined'}
        ``'sequential'`` – each filter consumes the previous filter's
        output variable.
        ``'combined'`` – every mask is computed on the original
        variable, then combined with logical AND.
    output_name : str, optional
        Alias for the final filtered variable (combined mode only).

    Returns
    -------
    xr.Dataset
        Dataset augmented with the filtered variables and masks.

    """
    if not self.filters:
        raise ValueError("No filters added to pipeline")

    result = self.ds.copy()

    if mode == "sequential":
        # Each step reads the variable written by the previous one.
        active_var = self.var_name
        for idx, (flt, params) in enumerate(self.filters):
            tag = flt.filter_name if idx == 0 else f"{flt.filter_name}_{idx}"
            result = flt.apply(result, active_var, output_suffix=tag, **params)
            active_var = f"{self.var_name}_filtered_{tag}"

    elif mode == "combined":
        # All masks are derived from the untouched original variable.
        masks = []
        names: list[str] = []
        for flt, params in self.filters:
            out = flt.apply(result, self.var_name, **params)
            masks.append(out[f"mask_{flt.filter_name}"])
            names.append(flt.filter_name)

        combined = masks[0]
        for extra in masks[1:]:
            combined = combined & extra

        label = output_name or "_".join(names)
        result[f"{self.var_name}_filtered_{label}"] = result[
            self.var_name
        ].where(combined)
        result[f"mask_{label}"] = combined

    else:
        raise ValueError(f"Unknown mode: {mode}")

    return result

create_per_cell_iqr_filter(ds, var_name='VOD', factor=1.5, cell_id_var=None, min_observations=5)

One-liner per-cell IQR filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
def create_per_cell_iqr_filter(
    ds: xr.Dataset,
    var_name: str = "VOD",
    factor: float = 1.5,
    cell_id_var: str | None = None,
    min_observations: int = 5,
) -> xr.Dataset:
    """Convenience wrapper: per-cell IQR filtering in a single call.

    Equivalent to ``PerCellIQRFilter().apply(...)`` with all
    parameters forwarded unchanged.
    """
    iqr_filter = PerCellIQRFilter()
    return iqr_filter.apply(
        ds,
        var_name,
        factor=factor,
        min_observations=min_observations,
        cell_id_var=cell_id_var,
    )

create_per_cell_zscore_filter(ds, var_name='VOD', threshold=3.0, cell_id_var=None, min_observations=5)

One-liner per-cell z-score filter.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_filtering.py
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
def create_per_cell_zscore_filter(
    ds: xr.Dataset,
    var_name: str = "VOD",
    threshold: float = 3.0,
    cell_id_var: str | None = None,
    min_observations: int = 5,
) -> xr.Dataset:
    """Convenience wrapper: per-cell z-score filtering in a single call.

    Equivalent to ``PerCellZScoreFilter().apply(...)`` with all
    parameters forwarded unchanged.
    """
    zscore_filter = PerCellZScoreFilter()
    return zscore_filter.apply(
        ds,
        var_name,
        threshold=threshold,
        min_observations=min_observations,
        cell_id_var=cell_id_var,
    )

Hampel Filtering

Parallelized Hampel filtering for gridded VOD data.

Spatial-batch multiprocessing Hampel filter with complete temporal coverage (no temporal chunking). Each (cell_id, SID) time series is filtered independently using median absolute deviation (MAD).

Functions

process_spatial_batch_worker – picklable worker for one spatial batch. hampel_cell_sid_parallelized – main entry point (no temporal aggregation). aggr_hampel_cell_sid_parallelized – with optional temporal aggregation.

Notes

  • Worker function is module-level so it can be pickled by multiprocessing.Pool.
  • Default spatial batch size is 500 cells; tune based on available memory.
  • Expected throughput: 300–700 K cell-SID combinations / s on a typical multi-core machine.

process_spatial_batch_worker(args)

Process a single spatial batch for one SID.

Designed to be pickled and dispatched to worker processes. Returns compact index lists to minimise IPC memory transfer.

Parameters

args : tuple (batch_cells, vod_values_sid, cell_ids_sid, valid_indices, threshold, min_obs_per_sid, batch_idx, sid_idx)

batch_cells : np.ndarray
    Cell IDs in this batch.
vod_values_sid : np.ndarray
    VOD values for the current SID (valid entries only).
cell_ids_sid : np.ndarray
    Corresponding cell IDs (same length as *vod_values_sid*).
valid_indices : np.ndarray
    Original epoch indices of the valid entries.
threshold : float
    MAD threshold for outlier detection.
min_obs_per_sid : int
    Minimum observations required per cell to run filter.
batch_idx : int
    Batch index (for bookkeeping).
sid_idx : int
    SID index (for bookkeeping).

Returns

dict Keys: batch_idx, sid_idx, outlier_indices, processing_indices, combinations, filtered.

Source code in packages/canvod-grids/src/canvod/grids/analysis/hampel_filtering.py
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def process_spatial_batch_worker(args: tuple) -> dict:
    """Hampel-filter one spatial batch of cells for a single SID.

    Module-level and argument-packed so it can be pickled and shipped
    to ``multiprocessing.Pool`` workers; returns compact index lists
    to keep inter-process transfer small.

    Parameters
    ----------
    args : tuple
        ``(batch_cells, vod_values_sid, cell_ids_sid, valid_indices,
        threshold, min_obs_per_sid, batch_idx, sid_idx)``

        batch_cells : np.ndarray
            Cell IDs in this batch.
        vod_values_sid : np.ndarray
            VOD values for the current SID (valid entries only).
        cell_ids_sid : np.ndarray
            Corresponding cell IDs (same length as *vod_values_sid*).
        valid_indices : np.ndarray
            Original epoch indices of the valid entries.
        threshold : float
            MAD threshold for outlier detection.
        min_obs_per_sid : int
            Minimum observations required per cell to run the filter.
        batch_idx : int
            Batch index (for bookkeeping).
        sid_idx : int
            SID index (for bookkeeping).

    Returns
    -------
    dict
        Keys: ``batch_idx``, ``sid_idx``, ``outlier_indices``,
        ``processing_indices``, ``combinations``, ``filtered``.

    """
    (
        batch_cells,
        vod_vals,
        cell_labels,
        epoch_indices,
        threshold,
        min_obs,
        batch_idx,
        sid_idx,
    ) = args

    outlier_idx: list[int] = []
    processed_idx: list[int] = []
    n_combinations = 0
    n_filtered = 0

    for cid in batch_cells:
        n_combinations += 1

        selector = cell_labels == cid
        if not selector.any():
            continue

        values = vod_vals[selector]
        positions = epoch_indices[selector]

        # Too few samples in this cell: leave the series untouched.
        if len(values) < min_obs:
            continue

        processed_idx.extend(positions)
        n_filtered += 1

        # Robust center/spread: median and median absolute deviation.
        center = np.median(values)
        spread = np.median(np.abs(values - center))

        if spread > 0:
            flags = np.abs(values - center) > threshold * spread
            if flags.any():
                outlier_idx.extend(positions[flags])

    return {
        "batch_idx": batch_idx,
        "sid_idx": sid_idx,
        "outlier_indices": outlier_idx,
        "processing_indices": processed_idx,
        "combinations": n_combinations,
        "filtered": n_filtered,
    }

hampel_cell_sid_parallelized(vod_ds, grid_name='equal_area_2deg', threshold=3.0, min_obs_per_sid=20, spatial_batch_size=500, n_workers=None)

Parallelized cell–SID Hampel filter with complete temporal coverage.

Each (cell_id, SID) time series is filtered independently using global (non-chunked) statistics. Spatial work is distributed across n_workers processes.

Parameters

vod_ds : xr.Dataset Input dataset containing 'VOD' and a cell_id_<grid_name> variable, with dimensions (epoch, sid). grid_name : str Grid identifier used to locate the cell-ID variable, e.g. 'equal_area_2deg'. threshold : float MAD multiplier for outlier detection. min_obs_per_sid : int Minimum valid observations per cell-SID to run the filter. spatial_batch_size : int Number of cells per parallel batch. n_workers : int or None Number of worker processes. Defaults to min(cpu_count(), 8).

Returns

xr.Dataset Copy of vod_ds with additional variables:

* ``VOD_filtered_hampel`` – filtered VOD (outliers set to NaN).
* ``hampel_processing_mask`` – boolean mask of processed
  observations.

Dataset-level attrs include full processing metadata.
Source code in packages/canvod-grids/src/canvod/grids/analysis/hampel_filtering.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
def hampel_cell_sid_parallelized(
    vod_ds: xr.Dataset,
    grid_name: str = "equal_area_2deg",
    threshold: float = 3.0,
    min_obs_per_sid: int = 20,
    spatial_batch_size: int = 500,
    n_workers: int | None = None,
) -> xr.Dataset:
    """Parallelized cell–SID Hampel filter with complete temporal coverage.

    Each (cell_id, SID) time series is filtered independently using
    global (non-chunked) statistics.  Spatial work is distributed
    across *n_workers* processes.  A single process pool is created
    once and reused for every SID; the workers are stateless, so
    spawning a fresh pool per SID (as a naive implementation would)
    only adds process start-up overhead ``n_sids`` times.

    Parameters
    ----------
    vod_ds : xr.Dataset
        Input dataset containing ``'VOD'`` and a ``cell_id_<grid_name>``
        variable, with dimensions ``(epoch, sid)``.
    grid_name : str
        Grid identifier used to locate the cell-ID variable, e.g.
        ``'equal_area_2deg'``.
    threshold : float
        MAD multiplier for outlier detection.
    min_obs_per_sid : int
        Minimum valid observations per cell-SID to run the filter.
    spatial_batch_size : int
        Number of cells per parallel batch.
    n_workers : int or None
        Number of worker processes.  Defaults to ``min(cpu_count(), 8)``.

    Returns
    -------
    xr.Dataset
        Copy of *vod_ds* with additional variables:

        * ``VOD_filtered_hampel`` – filtered VOD (outliers set to NaN).
        * ``hampel_processing_mask`` – boolean mask of processed
          observations.

        Dataset-level attrs include full processing metadata.

    """
    if n_workers is None:
        n_workers = min(cpu_count(), 8)

    cell_id_var = f"cell_id_{grid_name}"
    n_epochs, n_sids = vod_ds.VOD.shape
    cell_ids = vod_ds[cell_id_var].values
    vod_values = vod_ds.VOD.values

    logger.info(
        "Parallelized Hampel filter: shape=%s, threshold=%.1f, "
        "min_obs=%d, workers=%d, batch_size=%d",
        vod_ds.VOD.shape,
        threshold,
        min_obs_per_sid,
        n_workers,
        spatial_batch_size,
    )

    # Unique cells and spatial batches
    unique_cells = np.unique(cell_ids[np.isfinite(cell_ids)])
    n_spatial_batches = int(np.ceil(len(unique_cells) / spatial_batch_size))

    cell_batches = [
        (unique_cells[i : i + spatial_batch_size], i // spatial_batch_size)
        for i in range(0, len(unique_cells), spatial_batch_size)
    ]

    # Result arrays
    outlier_mask = np.zeros((n_epochs, n_sids), dtype=bool)
    processing_mask = np.zeros((n_epochs, n_sids), dtype=bool)

    total_combinations_processed = 0
    total_combinations_filtered = 0
    overall_start = time.time()

    # Create the worker pool ONCE and reuse it for every SID's map().
    # Pool construction forks/spawns processes, which is far more
    # expensive than the per-SID work dispatch itself.
    with Pool(n_workers) as pool:
        for sid_idx in range(n_sids):
            if sid_idx % 25 == 0:
                elapsed = time.time() - overall_start
                if sid_idx > 0 and elapsed > 0:
                    rate = total_combinations_processed / elapsed
                    logger.debug(
                        "SID %d/%d | rate=%.0f combinations/s",
                        sid_idx + 1,
                        n_sids,
                        rate,
                    )

            # Only epochs where both VOD and the cell ID are finite.
            valid_mask = np.isfinite(vod_values[:, sid_idx]) & np.isfinite(
                cell_ids[:, sid_idx]
            )
            if not np.any(valid_mask):
                total_combinations_processed += len(unique_cells)
                continue

            valid_indices = np.where(valid_mask)[0]
            sid_cells = cell_ids[valid_mask, sid_idx]
            sid_vod = vod_values[valid_mask, sid_idx]

            batch_args = [
                (
                    batch_cells,
                    sid_vod,
                    sid_cells,
                    valid_indices,
                    threshold,
                    min_obs_per_sid,
                    batch_idx,
                    sid_idx,
                )
                for batch_cells, batch_idx in cell_batches
            ]

            batch_results = pool.map(process_spatial_batch_worker, batch_args)

            for result in batch_results:
                if result["outlier_indices"]:
                    outlier_mask[result["outlier_indices"], sid_idx] = True
                if result["processing_indices"]:
                    processing_mask[result["processing_indices"], sid_idx] = True
                total_combinations_processed += result["combinations"]
                total_combinations_filtered += result["filtered"]

    total_time = time.time() - overall_start

    # Build result dataset
    vod_filtered = vod_values.copy()
    vod_filtered[outlier_mask] = np.nan

    result_ds = vod_ds.copy()
    result_ds["VOD_filtered_hampel"] = (["epoch", "sid"], vod_filtered)
    result_ds["hampel_processing_mask"] = (["epoch", "sid"], processing_mask)

    # Statistics for logging
    original_valid = int(np.sum(np.isfinite(vod_values)))
    outliers_removed = int(np.sum(outlier_mask))
    outlier_pct = outliers_removed / original_valid * 100 if original_valid > 0 else 0.0

    logger.info(
        "Hampel complete: time=%.1fs, rate=%.0f comb/s, "
        "outliers=%d (%.2f%%), processed=%d combinations",
        total_time,
        total_combinations_processed / total_time if total_time > 0 else 0,
        outliers_removed,
        outlier_pct,
        total_combinations_processed,
    )

    # Metadata
    result_ds.attrs.update(
        {
            "hampel_filtering": "parallelized_complete_temporal",
            "hampel_threshold": threshold,
            "min_obs_per_sid": min_obs_per_sid,
            "spatial_batch_size": spatial_batch_size,
            "n_workers": n_workers,
            "spatial_batches": n_spatial_batches,
            "temporal_chunking": "none",
            "temporal_coverage": "complete",
            "processing_time_seconds": total_time,
            "processing_rate_combinations_per_second": (
                total_combinations_processed / total_time if total_time > 0 else 0
            ),
            "combinations_processed": total_combinations_processed,
            "combinations_filtered": total_combinations_filtered,
            "parallel_efficiency": (
                total_combinations_processed / (n_workers * total_time)
                if total_time > 0
                else 0
            ),
            "outliers_removed": outliers_removed,
            "outlier_percentage": outlier_pct,
            "scientific_validity": "complete_temporal_continuity",
            "parallelization_method": "multiprocessing_spatial_batches",
        }
    )

    result_ds["VOD_filtered_hampel"].attrs.update(
        {
            "long_name": (
                "VOD filtered with parallelized complete temporal Hampel method"
            ),
            "method": "global_statistics_per_cell_sid",
            "temporal_coverage": "complete_no_chunking",
            "parallelization": f"{n_workers}_workers_spatial_batching",
        }
    )

    result_ds["hampel_processing_mask"].attrs.update(
        {
            "long_name": "Mask of observations processed by parallel Hampel filter",
            "temporal_coverage": "complete",
            "processing_method": "parallel_spatial_batches",
        }
    )

    return result_ds

aggr_hampel_cell_sid_parallelized(vod_ds, grid_name='equal_area_2deg', threshold=3.0, min_obs_per_sid=20, spatial_batch_size=500, n_workers=None, temporal_agg=None, agg_method='mean')

Parallelized cell–SID Hampel filter with optional temporal aggregation.

Each (cell_id, SID) series is filtered independently. When temporal_agg is set, data is first binned into temporal windows before filtering; φ, θ and cell IDs are reassigned consistently on the output.

Parameters

vod_ds : xr.Dataset Input dataset with 'VOD', 'phi', 'theta' and a cell_id_<grid_name> variable. grid_name : str Grid identifier (e.g. 'equal_area_2deg'). Used to locate or create the cell-ID variable. threshold : float MAD multiplier. min_obs_per_sid : int Minimum valid observations per cell-SID. spatial_batch_size : int Cells per parallel batch. n_workers : int or None Worker count (defaults to min(cpu_count(), 8)). temporal_agg : str or None Temporal aggregation frequency (e.g. '1H', '1D'). None → no aggregation. agg_method : str 'mean' or 'median' for temporal aggregation.

Returns

xr.Dataset Dataset with 'VOD' (aggregated raw), 'VOD_filtered_hampel', 'hampel_outlier_mask', 'hampel_processing_mask', reassigned phi, theta and cell_id_<grid_name>.

Source code in packages/canvod-grids/src/canvod/grids/analysis/hampel_filtering.py
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
def aggr_hampel_cell_sid_parallelized(
    vod_ds: xr.Dataset,
    grid_name: str = "equal_area_2deg",
    threshold: float = 3.0,
    min_obs_per_sid: int = 20,
    spatial_batch_size: int = 500,
    n_workers: int | None = None,
    temporal_agg: str | None = None,
    agg_method: str = "mean",
) -> xr.Dataset:
    """Parallelized cell–SID Hampel filter with optional temporal aggregation.

    Each (cell_id, SID) series is filtered independently.  When
    *temporal_agg* is set, data is first binned into temporal windows
    before filtering; φ, θ and cell IDs are reassigned consistently on
    the output.

    Parameters
    ----------
    vod_ds : xr.Dataset
        Input dataset with ``'VOD'``, ``'phi'``, ``'theta'`` and a
        ``cell_id_<grid_name>`` variable.
    grid_name : str
        Grid identifier (e.g. ``'equal_area_2deg'``).  Used to locate
        or create the cell-ID variable.
    threshold : float
        MAD multiplier.
    min_obs_per_sid : int
        Minimum valid observations per cell-SID.
    spatial_batch_size : int
        Cells per parallel batch.
    n_workers : int or None
        Worker count (defaults to ``min(cpu_count(), 8)``).
    temporal_agg : str or None
        Temporal aggregation frequency (e.g. ``'1H'``, ``'1D'``).
        ``None`` → no aggregation.
    agg_method : str
        ``'mean'`` or ``'median'`` for temporal aggregation.

    Returns
    -------
    xr.Dataset
        Dataset with ``'VOD'`` (aggregated raw), ``'VOD_filtered_hampel'``,
        ``'hampel_outlier_mask'``, ``'hampel_processing_mask'``,
        reassigned ``phi``, ``theta`` and ``cell_id_<grid_name>``.

    Notes
    -----
    Fixes over the earlier implementation:

    * ``VOD_filtered_hampel`` now actually has the detected outliers
      set to NaN (previously it was identical to ``VOD``).
    * Bin means are computed from exact per-bin sums/counts instead of
      an order-dependent pairwise ``nanmean`` chain, so bins receiving
      more than two observations are averaged correctly.
    * One worker pool is reused across all SIDs instead of being
      re-spawned per SID.

    """
    if n_workers is None:
        n_workers = min(cpu_count(), 8)

    # --- Parse grid_name → (grid_type, resolution) ---
    parts = grid_name.split("_")
    if not parts[-1].endswith("deg"):
        raise ValueError(f"Grid name '{grid_name}' must end with '<N>deg'")
    try:
        resolution = float(parts[-1].replace("deg", ""))
    except ValueError as err:
        # Chain the original parse failure for easier debugging.
        raise ValueError(
            f"Could not parse resolution from grid name '{grid_name}'"
        ) from err
    grid_type = "_".join(parts[:-1])

    grid = create_hemigrid(angular_resolution=resolution, grid_type=grid_type)

    cell_id_var = f"cell_id_{grid_name}"
    if cell_id_var not in vod_ds:
        logger.info("No '%s' in input dataset — assigning now.", cell_id_var)
        vod_ds = add_cell_ids_to_vod_fast(vod_ds, grid=grid, grid_name=grid_name)

    vod_values = vod_ds["VOD"].values
    cell_ids = vod_ds[cell_id_var].values
    n_epochs, n_sids = vod_values.shape

    unique_cells = np.unique(cell_ids[np.isfinite(cell_ids)])
    cell_batches = [
        (unique_cells[i : i + spatial_batch_size], i // spatial_batch_size)
        for i in range(0, len(unique_cells), spatial_batch_size)
    ]

    logger.info(
        "aggr_hampel: shape=%s, threshold=%.1f, workers=%d, temporal_agg=%s",
        vod_ds.VOD.shape,
        threshold,
        n_workers,
        temporal_agg,
    )

    # --- Epoch grid ---
    if temporal_agg:
        new_epochs = _compute_time_bins(vod_ds["epoch"].values, temporal_agg)
        logger.info("Temporal aggregation → %d epoch bins", len(new_epochs))
    else:
        new_epochs = vod_ds["epoch"].values

    n_bins = len(new_epochs)

    # --- Preallocate ---
    # Accumulate exact per-bin sums and counts; the mean is taken once
    # at the end.  (A running pairwise nanmean would weight earlier
    # observations incorrectly whenever a bin receives >2 values.)
    vod_sum = np.zeros((n_bins, n_sids))
    vod_cnt = np.zeros((n_bins, n_sids), dtype=np.int64)
    outlier_mask = np.zeros((n_bins, n_sids), dtype=bool)
    processing_mask = np.zeros((n_bins, n_sids), dtype=bool)

    total_combinations_processed = 0
    total_combinations_filtered = 0
    start_time = time.time()

    # Reuse a single pool for every SID (pool start-up is expensive).
    with Pool(n_workers) as pool:
        for sid_idx in range(n_sids):
            valid_mask = np.isfinite(vod_values[:, sid_idx]) & np.isfinite(
                cell_ids[:, sid_idx]
            )
            if not np.any(valid_mask):
                continue

            sid_cells = cell_ids[valid_mask, sid_idx]
            sid_vod = vod_values[valid_mask, sid_idx]
            valid_times = vod_ds["epoch"].values[valid_mask]

            if temporal_agg:
                valid_times, sid_cells, sid_vod = _aggregate_temporally(
                    valid_times, sid_cells, sid_vod, temporal_agg, agg_method
                )

            # Parallel filtering
            batch_args = [
                (
                    batch_cells,
                    sid_vod,
                    sid_cells,
                    np.arange(len(valid_times)),
                    threshold,
                    min_obs_per_sid,
                    batch_idx,
                    sid_idx,
                )
                for batch_cells, batch_idx in cell_batches
            ]

            batch_results = pool.map(process_spatial_batch_worker, batch_args)

            for res in batch_results:
                if res["outlier_indices"]:
                    t_idx = np.searchsorted(
                        new_epochs,
                        valid_times[res["outlier_indices"]],
                    )
                    outlier_mask[t_idx, sid_idx] = True
                if res["processing_indices"]:
                    t_idx = np.searchsorted(
                        new_epochs,
                        valid_times[res["processing_indices"]],
                    )
                    processing_mask[t_idx, sid_idx] = True
                total_combinations_processed += res["combinations"]
                total_combinations_filtered += res["filtered"]

            # Accumulate this SID's observations into their epoch bins.
            bin_index = np.searchsorted(new_epochs, valid_times)
            keep = (bin_index >= 0) & (bin_index < n_bins) & np.isfinite(sid_vod)
            np.add.at(vod_sum[:, sid_idx], bin_index[keep], sid_vod[keep])
            np.add.at(vod_cnt[:, sid_idx], bin_index[keep], 1)

    total_time = time.time() - start_time

    # Exact bin means; bins with no observations stay NaN.
    vod_agg = np.where(
        vod_cnt > 0, vod_sum / np.maximum(vod_cnt, 1), np.nan
    )
    # Apply the Hampel verdict: flagged bins are removed from the
    # filtered variable (previously the mask was computed but unused).
    vod_filtered = vod_agg.copy()
    vod_filtered[outlier_mask] = np.nan

    # --- Build output dataset ---
    result_ds = xr.Dataset(
        data_vars={
            "VOD": (["epoch", "sid"], vod_agg),
            "VOD_filtered_hampel": (["epoch", "sid"], vod_filtered),
            "hampel_outlier_mask": (["epoch", "sid"], outlier_mask),
            "hampel_processing_mask": (["epoch", "sid"], processing_mask),
        },
        coords={"epoch": new_epochs, "sid": vod_ds["sid"].values},
        attrs=vod_ds.attrs.copy(),
    )

    # Copy geometry if present
    if "phi" in vod_ds and "theta" in vod_ds:
        phi_tmpl = vod_ds["phi"].isel(epoch=0).values
        theta_tmpl = vod_ds["theta"].isel(epoch=0).values
        result_ds["phi"] = (
            ["epoch", "sid"],
            np.repeat(phi_tmpl[None, :], n_bins, axis=0),
        )
        result_ds["theta"] = (
            ["epoch", "sid"],
            np.repeat(theta_tmpl[None, :], n_bins, axis=0),
        )

    # Reassign cell IDs consistently
    logger.info("Reassigning cell IDs for output dataset.")
    result_ds = add_cell_ids_to_vod_fast(result_ds, grid=grid, grid_name=grid_name)

    # Metadata
    result_ds.attrs.update(
        {
            "processing": "parallelized_hampel_with_optional_aggregation",
            "temporal_aggregation": temporal_agg or "none",
            "aggregation_method": agg_method,
            "threshold": threshold,
            "min_obs_per_sid": min_obs_per_sid,
            "spatial_batch_size": spatial_batch_size,
            "n_workers": n_workers,
            "execution_time_s": total_time,
            "combinations_processed": int(total_combinations_processed),
            "combinations_filtered": int(total_combinations_filtered),
        }
    )

    logger.info(
        "aggr_hampel complete: epochs=%d, sids=%d, time=%.1fs",
        n_bins,
        n_sids,
        total_time,
    )

    return result_ds

Sigma-Clip Filtering

Vectorised numba-JIT Hampel filter with astropy sigma-clipping fallback.

Two complementary high-performance filtering strategies for gridded VOD data:

  • Vectorised + numba (astropy_hampel_vectorized_fast) – sliding-window Hampel filter compiled with numba.jit. Processes temporal chunks in cell batches; targets sub-5-minute runtimes for ~1.5 years of data.
  • Ultra-fast (astropy_hampel_ultra_fast) – pure-numpy vectorisation backed by astropy.stats.sigma_clip. Drops the per-window granularity in exchange for extreme throughput.

Functions

vectorized_sliding_window_hampel – numba-compiled core loop. process_cell_batch_vectorized – batch dispatcher for one temporal chunk. astropy_hampel_vectorized_fast – full pipeline (numba path). astropy_hampel_ultra_fast – full pipeline (astropy path).

Notes

  • vectorized_sliding_window_hampel uses numba.prange so the outer loop is distributed across all available cores automatically.
  • The 1.4826 MAD scaling factor matches the convention used by astropy.stats.mad_std.
  • Both top-level functions expect a cell_id_<grid_name> variable already present in the input dataset (see :func:canvod.grids.add_cell_ids_to_vod_fast).

vectorized_sliding_window_hampel(data, times, window_ns, sigma_threshold=3.0, min_points=5)

Sliding-window Hampel filter compiled with numba.

Each point is compared against the robust statistics (median, scaled MAD) of its temporal neighbourhood. Points whose deviation exceeds sigma_threshold × 1.4826 × MAD are flagged as outliers and set to NaN.

Parameters

data : np.ndarray 1-D array of values to filter. times : np.ndarray 1-D array of timestamps as int64 nanoseconds. window_ns : int Half-window size in nanoseconds. sigma_threshold : float, optional Number of scaled-MAD units for the outlier boundary. min_points : int, optional Minimum number of finite points required in the window to attempt filtering; points in smaller windows are left unchanged.

Returns

filtered_data : np.ndarray Copy of data with outliers replaced by NaN. outlier_mask : np.ndarray Boolean mask; True where an outlier was detected.

Source code in packages/canvod-grids/src/canvod/grids/analysis/sigma_clip_filter.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
@jit(nopython=True, parallel=True)
def vectorized_sliding_window_hampel(
    data: np.ndarray,
    times: np.ndarray,
    window_ns: int,
    sigma_threshold: float = 3.0,
    min_points: int = 5,
) -> tuple[np.ndarray, np.ndarray]:
    """Sliding-window Hampel filter compiled with numba.

    Each point is compared against the robust statistics (median,
    scaled MAD) of its temporal neighbourhood.  Points whose deviation
    exceeds *sigma_threshold* × 1.4826 × MAD are flagged as outliers
    and set to ``NaN``.

    Parameters
    ----------
    data : np.ndarray
        1-D array of values to filter.
    times : np.ndarray
        1-D array of timestamps as ``int64`` nanoseconds.
    window_ns : int
        Half-window size in nanoseconds.
    sigma_threshold : float, optional
        Number of scaled-MAD units for the outlier boundary.
    min_points : int, optional
        Minimum number of finite points required in the window to
        attempt filtering; points in smaller windows are left unchanged.

    Returns
    -------
    filtered_data : np.ndarray
        Copy of *data* with outliers replaced by ``NaN``.
    outlier_mask : np.ndarray
        Boolean mask; ``True`` where an outlier was detected.

    """
    n_points = len(data)
    # Work on a copy so the caller's array is never mutated.
    filtered_data = data.copy()
    outlier_mask = np.zeros(n_points, dtype=np.bool_)

    # prange: numba distributes iterations over threads.  Each iteration
    # writes only to its own index i, so there are no write conflicts.
    for i in prange(n_points):
        # Non-finite inputs pass through unchanged and are never flagged.
        if not np.isfinite(data[i]):
            continue

        # Symmetric window of ±window_ns around this sample's timestamp.
        current_time = times[i]
        window_start = current_time - window_ns
        window_end = current_time + window_ns

        # Points inside the temporal window with finite values.
        # NOTE: full-array scan per point — O(n) here, O(n²) overall;
        # acceptable because the outer loop is parallelised.
        window_indices = np.where(
            (times >= window_start) & (times <= window_end) & np.isfinite(data)
        )[0]

        # Too few neighbours → statistics unreliable; leave point as-is.
        if len(window_indices) < min_points:
            continue

        window_data = data[window_indices]

        # Robust centre/spread; the window includes the point itself.
        median_val = np.median(window_data)
        mad_val = np.median(np.abs(window_data - median_val))

        # NOTE(review): when MAD == 0 (e.g. a constant window) nothing is
        # flagged even if data[i] deviates from the median — confirm this
        # is the intended behaviour for quantised data.
        if mad_val > 0:
            # 1.4826 scales MAD to the σ of a normal distribution
            # (matches astropy.stats.mad_std).
            threshold_value = sigma_threshold * 1.4826 * mad_val
            if np.abs(data[i] - median_val) > threshold_value:
                outlier_mask[i] = True
                filtered_data[i] = np.nan

    return filtered_data, outlier_mask

process_cell_batch_vectorized(cell_batch, vod_chunk, times_chunk, cell_ids_chunk, window_hours, sigma_threshold, min_points)

Apply the numba Hampel filter to a subset of cells in one temporal chunk.

Parameters

cell_batch : np.ndarray 1-D array of cell IDs to process in this batch. vod_chunk : np.ndarray 2-D (epoch, sid) VOD values for the current temporal chunk. times_chunk : np.ndarray 1-D datetime64 timestamps for the chunk. cell_ids_chunk : np.ndarray 2-D (epoch, sid) cell-ID array matching vod_chunk. window_hours : float Half-window size in hours. sigma_threshold : float Outlier threshold in scaled-MAD units. min_points : int Minimum finite points required per window.

Returns

filtered_chunk : np.ndarray Filtered copy of vod_chunk. outlier_chunk : np.ndarray Boolean outlier mask with the same shape as vod_chunk.

Source code in packages/canvod-grids/src/canvod/grids/analysis/sigma_clip_filter.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def process_cell_batch_vectorized(
    cell_batch: np.ndarray,
    vod_chunk: np.ndarray,
    times_chunk: np.ndarray,
    cell_ids_chunk: np.ndarray,
    window_hours: float,
    sigma_threshold: float,
    min_points: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Run the numba Hampel filter over selected cells of one temporal chunk.

    Every (cell, SID) time series in the chunk is filtered independently
    with :func:`vectorized_sliding_window_hampel`.

    Parameters
    ----------
    cell_batch : np.ndarray
        Cell IDs to process in this call.
    vod_chunk : np.ndarray
        ``(epoch, sid)`` VOD values for the temporal chunk.
    times_chunk : np.ndarray
        ``datetime64`` timestamps, one per epoch row.
    cell_ids_chunk : np.ndarray
        ``(epoch, sid)`` cell IDs aligned with *vod_chunk*.
    window_hours : float
        Half-window size in hours.
    sigma_threshold : float
        Outlier boundary in scaled-MAD units.
    min_points : int
        Minimum finite points required per window.

    Returns
    -------
    filtered_chunk : np.ndarray
        Copy of *vod_chunk* with detected outliers set to NaN.
    outlier_chunk : np.ndarray
        Boolean outlier mask, same shape as *vod_chunk*.

    """
    n_epochs, n_sids = vod_chunk.shape
    filtered_chunk = vod_chunk.copy()
    outlier_chunk = np.zeros((n_epochs, n_sids), dtype=bool)

    # Hours → integer nanoseconds, matching the int64 timestamp axis.
    half_window_ns = int(window_hours * 3600 * 1e9)
    epoch_ns = times_chunk.astype("datetime64[ns]").astype(np.int64)

    for col in range(n_sids):
        col_vod = vod_chunk[:, col]
        col_cells = cell_ids_chunk[:, col]

        # Only rows with both a finite value and a finite cell ID count.
        usable = np.isfinite(col_vod) & np.isfinite(col_cells)
        if not np.any(usable):
            continue

        for cid in cell_batch:
            hits = np.where(usable & (col_cells == cid))[0]
            if hits.size == 0:
                continue

            series = col_vod[hits]
            stamps = epoch_ns[hits]

            # Too short to estimate robust statistics — skip.
            if series.size < min_points:
                continue

            try:
                new_vals, new_flags = vectorized_sliding_window_hampel(
                    series, stamps, half_window_ns, sigma_threshold, min_points
                )
                filtered_chunk[hits, col] = new_vals
                outlier_chunk[hits, col] = new_flags
            except Exception:
                # Best-effort: keep the original data for this cell/SID.
                continue

    return filtered_chunk, outlier_chunk

astropy_hampel_vectorized_fast(vod_ds, grid_name='equal_area_2deg', window_hours=1.0, sigma_threshold=3.0, min_points=5, cell_batch_size=200, n_workers=None)

Numba-accelerated sliding-window Hampel filter over a VOD dataset.

Temporal chunks (as stored in the dask-backed dataset) are iterated sequentially; within each chunk the unique cells are split into batches of cell_batch_size and dispatched to :func:process_cell_batch_vectorized, which in turn calls the numba-compiled :func:vectorized_sliding_window_hampel.

Parameters

vod_ds : xr.Dataset VOD dataset containing a cell_id_<grid_name> variable. grid_name : str, optional Suffix used to locate the cell-ID variable (cell_id_<grid_name>). window_hours : float, optional Half-window size in hours for the sliding window. sigma_threshold : float, optional Outlier boundary in scaled-MAD units. min_points : int, optional Minimum finite points required in a window. cell_batch_size : int, optional Number of cells per batch (trades memory for cache locality). n_workers : int or None, optional Unused in this implementation; reserved for future parallel chunk processing.

Returns

result_ds : xr.Dataset Copy of vod_ds with two additional variables:

``VOD_filtered``
    Filtered VOD (outliers → NaN).
``hampel_outlier_mask``
    Boolean mask; ``True`` at outlier positions.

Raises

ValueError If the expected cell-ID variable is missing from vod_ds.

Source code in packages/canvod-grids/src/canvod/grids/analysis/sigma_clip_filter.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
def astropy_hampel_vectorized_fast(
    vod_ds: xr.Dataset,
    grid_name: str = "equal_area_2deg",
    window_hours: float = 1.0,
    sigma_threshold: float = 3.0,
    min_points: int = 5,
    cell_batch_size: int = 200,
    n_workers: int | None = None,
) -> xr.Dataset:
    """Numba-accelerated sliding-window Hampel filter over a VOD dataset.

    Temporal chunks (as stored in the dask-backed dataset) are iterated
    sequentially; within each chunk the unique cells are split into
    batches of *cell_batch_size* and dispatched to
    :func:`process_cell_batch_vectorized`, which in turn calls the
    numba-compiled :func:`vectorized_sliding_window_hampel`.

    Parameters
    ----------
    vod_ds : xr.Dataset
        VOD dataset containing a ``cell_id_<grid_name>`` variable.
    grid_name : str, optional
        Suffix used to locate the cell-ID variable
        (``cell_id_<grid_name>``).
    window_hours : float, optional
        Half-window size in hours for the sliding window.
    sigma_threshold : float, optional
        Outlier boundary in scaled-MAD units.
    min_points : int, optional
        Minimum finite points required in a window.
    cell_batch_size : int, optional
        Number of cells per batch (trades memory for cache locality).
    n_workers : int or None, optional
        Unused in this implementation; reserved for future parallel
        chunk processing.

    Returns
    -------
    result_ds : xr.Dataset
        Copy of *vod_ds* with two additional variables:

        ``VOD_filtered``
            Filtered VOD (outliers → NaN).
        ``hampel_outlier_mask``
            Boolean mask; ``True`` at outlier positions.

    Raises
    ------
    ValueError
        If the expected cell-ID variable is missing from *vod_ds*.

    """
    cell_id_var = f"cell_id_{grid_name}"
    if cell_id_var not in vod_ds:
        raise ValueError(
            f"Cell ID variable '{cell_id_var}' not found in dataset. "
            "Run add_cell_ids_to_vod_fast() first."
        )

    logger.info(
        "vectorized hampel: window=%.1fh, sigma=%.1f, batch_size=%d, shape=%s",
        window_hours,
        sigma_threshold,
        cell_batch_size,
        vod_ds.VOD.shape,
    )

    start_time = time.time()

    # Keep the dask arrays lazy here; each temporal chunk is computed
    # into memory one at a time inside _process_temporal_chunk.
    vod_data = vod_ds.VOD.data
    cell_ids = vod_ds[cell_id_var].data
    times = vod_ds.epoch.values

    logger.info("processing %d temporal chunks ...", vod_data.numblocks[0])

    def _process_temporal_chunk(chunk_idx: int) -> tuple[np.ndarray, np.ndarray]:
        """Process a single temporal chunk.

        Parameters
        ----------
        chunk_idx : int
            Chunk index along time axis.

        Returns
        -------
        tuple[np.ndarray, np.ndarray]
            Filtered values and outlier mask.

        """
        # chunksize[0] is the nominal first-axis chunk length; this assumes
        # uniform chunking along time (the min() only handles the final,
        # possibly shorter, chunk).
        chunk_start = chunk_idx * vod_data.chunksize[0]
        chunk_end = min(chunk_start + vod_data.chunksize[0], vod_data.shape[0])

        vod_chunk = vod_data[chunk_start:chunk_end].compute()
        cell_chunk = cell_ids[chunk_start:chunk_end].compute()
        times_chunk = times[chunk_start:chunk_end]

        # Cells actually observed in this chunk (NaN cell IDs excluded).
        unique_cells = np.unique(cell_chunk[np.isfinite(cell_chunk)])
        if len(unique_cells) == 0:
            return vod_chunk, np.zeros_like(vod_chunk, dtype=bool)

        n_cell_batches = int(np.ceil(len(unique_cells) / cell_batch_size))

        filtered_chunk = vod_chunk.copy()
        outlier_chunk = np.zeros_like(vod_chunk, dtype=bool)

        for batch_idx in range(n_cell_batches):
            batch_start = batch_idx * cell_batch_size
            batch_end = min(batch_start + cell_batch_size, len(unique_cells))
            cell_batch = unique_cells[batch_start:batch_end]

            batch_filtered, batch_outliers = process_cell_batch_vectorized(
                cell_batch=cell_batch,
                vod_chunk=vod_chunk,
                times_chunk=times_chunk,
                cell_ids_chunk=cell_chunk,
                window_hours=window_hours,
                sigma_threshold=sigma_threshold,
                min_points=min_points,
            )

            # batch_filtered starts from the *unfiltered* chunk, so copy
            # back only this batch's cells — copying the whole array would
            # overwrite results from earlier batches.
            for cell_id in cell_batch:
                cell_mask = cell_chunk == cell_id
                filtered_chunk[cell_mask] = batch_filtered[cell_mask]
                outlier_chunk[cell_mask] = batch_outliers[cell_mask]

        return filtered_chunk, outlier_chunk

    filtered_chunks: list[np.ndarray] = []
    outlier_chunks: list[np.ndarray] = []

    # Chunks are processed sequentially on purpose; n_workers is reserved
    # for a future parallel implementation.
    for chunk_idx in tqdm(range(vod_data.numblocks[0]), desc="Temporal chunks"):
        f_chunk, o_chunk = _process_temporal_chunk(chunk_idx)
        filtered_chunks.append(f_chunk)
        outlier_chunks.append(o_chunk)

    logger.info("assembling results ...")
    filtered_data = np.concatenate(filtered_chunks, axis=0)
    outlier_data = np.concatenate(outlier_chunks, axis=0)

    total_time = time.time() - start_time

    # Build result dataset
    result_ds = vod_ds.copy()
    result_ds["VOD_filtered"] = (("epoch", "sid"), filtered_data)
    result_ds["hampel_outlier_mask"] = (("epoch", "sid"), outlier_data)

    # NOTE: .values materialises the full (possibly dask-backed) VOD array
    # a second time, just to count finite observations.
    total_obs = int(np.isfinite(vod_ds.VOD.values).sum())
    outliers_detected = int(outlier_data.sum())
    outlier_pct = (outliers_detected / total_obs * 100) if total_obs > 0 else 0.0

    result_ds.attrs.update(
        {
            "hampel_method": "vectorized_astropy_equivalent",
            "hampel_window_hours": window_hours,
            "hampel_sigma_threshold": sigma_threshold,
            "hampel_min_points": min_points,
            "hampel_processing_time_s": total_time,
            "hampel_vectorized_optimized": True,
            "performance_target_achieved": total_time < 300,
        }
    )
    result_ds["VOD_filtered"].attrs.update(
        {
            "long_name": "VOD filtered with vectorized Hampel method",
            "method": "vectorized_sliding_window_with_numba_jit",
            "temporal_window_hours": window_hours,
            "outlier_threshold_sigma": sigma_threshold,
        }
    )

    logger.info(
        "done in %.1fs | obs=%d outliers=%d (%.2f%%) | target_met=%s",
        total_time,
        total_obs,
        outliers_detected,
        outlier_pct,
        total_time < 300,
    )

    return result_ds

astropy_hampel_ultra_fast(vod_ds, grid_name='equal_area_2deg', window_hours=1.0, sigma_threshold=3.0, min_points=5)

Pure-numpy sigma-clipping filter via astropy.stats.

Each (cell, SID) time series is clipped in one shot using :func:astropy.stats.sigma_clip with median centering and MAD scale estimation. No per-window temporal granularity is applied; this trades precision for throughput.

Parameters

vod_ds : xr.Dataset VOD dataset containing a cell_id_<grid_name> variable. grid_name : str, optional Suffix for the cell-ID variable. window_hours : float, optional Unused in ultra-fast mode; kept for API compatibility. sigma_threshold : float, optional Sigma-clipping threshold passed to astropy.stats.sigma_clip. min_points : int, optional Minimum finite observations required to attempt clipping.

Returns

result_ds : xr.Dataset Copy of vod_ds with VOD_filtered and hampel_outlier_mask variables added.

Raises

ValueError If the expected cell-ID variable is missing from vod_ds.

Source code in packages/canvod-grids/src/canvod/grids/analysis/sigma_clip_filter.py
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
def astropy_hampel_ultra_fast(
    vod_ds: xr.Dataset,
    grid_name: str = "equal_area_2deg",
    window_hours: float = 1.0,
    sigma_threshold: float = 3.0,
    min_points: int = 5,
) -> xr.Dataset:
    """Pure-numpy sigma-clipping filter via ``astropy.stats``.

    Each (cell, SID) time series is clipped in one shot using
    :func:`astropy.stats.sigma_clip` with median centering and MAD
    scale estimation.  No per-window temporal granularity is applied;
    this trades precision for throughput.

    Parameters
    ----------
    vod_ds : xr.Dataset
        VOD dataset containing a ``cell_id_<grid_name>`` variable.
    grid_name : str, optional
        Suffix for the cell-ID variable.
    window_hours : float, optional
        Unused in ultra-fast mode; kept for API compatibility.
    sigma_threshold : float, optional
        Sigma-clipping threshold passed to ``astropy.stats.sigma_clip``.
    min_points : int, optional
        Minimum finite observations required to attempt clipping.

    Returns
    -------
    result_ds : xr.Dataset
        Copy of *vod_ds* with ``VOD_filtered`` (outliers → NaN) and
        ``hampel_outlier_mask`` variables added.

    Raises
    ------
    ValueError
        If the expected cell-ID variable is missing from *vod_ds*.

    """
    cell_id_var = f"cell_id_{grid_name}"
    if cell_id_var not in vod_ds:
        raise ValueError(
            f"Cell ID variable '{cell_id_var}' not found in dataset. "
            "Run add_cell_ids_to_vod_fast() first."
        )

    logger.info(
        "ultra-fast hampel: sigma=%.1f, min_points=%d, shape=%s",
        sigma_threshold,
        min_points,
        vod_ds.VOD.shape,
    )

    start_time = time.time()

    vod_values = vod_ds.VOD.values
    cell_ids = vod_ds[cell_id_var].values

    filtered_data = vod_values.copy()
    outlier_mask = np.zeros_like(vod_values, dtype=bool)

    unique_cells = np.unique(cell_ids[np.isfinite(cell_ids)])
    logger.info(
        "processing %d cells × %d SIDs ...", len(unique_cells), vod_values.shape[1]
    )

    for cell_id in tqdm(unique_cells, desc="Cells"):
        for sid_idx in range(vod_values.shape[1]):
            cell_mask = (cell_ids[:, sid_idx] == cell_id) & np.isfinite(
                vod_values[:, sid_idx]
            )
            if not np.any(cell_mask):
                continue

            cell_data = vod_values[cell_mask, sid_idx]
            if len(cell_data) < min_points:
                continue

            try:
                clipped = sigma_clip(
                    cell_data,
                    sigma=sigma_threshold,
                    cenfunc="median",
                    stdfunc=mad_std,
                    maxiters=1,
                    masked=True,
                )
                cell_indices = np.where(cell_mask)[0]
                # BUGFIX: ``clipped.data`` still holds the *original*
                # values — a MaskedArray records clipped points only in
                # ``.mask``.  Fill masked entries with NaN so VOD_filtered
                # actually has outliers removed, matching the contract and
                # the vectorized variant.  getmaskarray() also handles the
                # nothing-clipped case where ``.mask`` is ``np.ma.nomask``.
                filtered_data[cell_indices, sid_idx] = np.ma.filled(
                    clipped, np.nan
                )
                outlier_mask[cell_indices, sid_idx] = np.ma.getmaskarray(clipped)
            except Exception:
                # Best-effort: leave this cell/SID unfiltered on failure.
                continue

    total_time = time.time() - start_time

    result_ds = vod_ds.copy()
    result_ds["VOD_filtered"] = (("epoch", "sid"), filtered_data)
    result_ds["hampel_outlier_mask"] = (("epoch", "sid"), outlier_mask)

    total_obs = int(np.isfinite(vod_values).sum())
    outliers = int(outlier_mask.sum())

    result_ds.attrs.update(
        {
            "hampel_method": "ultra_fast_astropy_sigma_clip",
            "hampel_sigma_threshold": sigma_threshold,
            "hampel_min_points": min_points,
            "hampel_processing_time_s": total_time,
        }
    )
    result_ds["VOD_filtered"].attrs.update(
        {
            "long_name": "VOD filtered with ultra-fast astropy sigma clipping",
            "method": "astropy_sigma_clip_per_cell_sid",
            "outlier_threshold_sigma": sigma_threshold,
        }
    )

    logger.info(
        "done in %.1fs | obs=%d outliers=%d (%.2f%%)",
        total_time,
        total_obs,
        outliers,
        (outliers / total_obs * 100) if total_obs > 0 else 0.0,
    )

    return result_ds

Masking

Spatial masking for hemispherical grid cells.

Provides tools to create boolean masks for selecting subsets of grid cells based on geometric constraints, data quality, or custom criteria.

Classes

SpatialMask – builder for boolean cell-selection masks.

Convenience functions

create_hemisphere_mask – north / south / east / west mask. create_elevation_mask – elevation-angle-based mask.

SpatialMask

Create spatial masks for grid cells.

Masks are boolean arrays where True = include cell, False = exclude cell. Multiple constraints can be combined with AND or OR logic.

Parameters

grid : GridData Grid instance.

Examples

mask = SpatialMask(grid) mask.add_phi_range(0, np.pi) # Northern hemisphere mask.add_theta_range(0, np.pi / 3) # Exclude low elevations mask.add_quality_threshold('mean_snr', min_value=40) spatial_mask = mask.compute() # Returns boolean array

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
class SpatialMask:
    """Create spatial masks for grid cells.

    Masks are boolean arrays where ``True`` = include cell,
    ``False`` = exclude cell.  Multiple constraints can be combined
    with AND or OR logic.

    Parameters
    ----------
    grid : GridData
        Grid instance.

    Examples
    --------
    >>> mask = SpatialMask(grid)
    >>> mask.add_phi_range(0, np.pi)          # Northern hemisphere
    >>> mask.add_theta_range(0, np.pi / 3)    # Exclude low elevations
    >>> mask.add_quality_threshold('mean_snr', min_value=40)
    >>> spatial_mask = mask.compute()          # Returns boolean array

    """

    def __init__(self, grid: GridData) -> None:
        """Initialize the spatial mask builder.

        Parameters
        ----------
        grid : GridData
            Grid instance.

        """
        self.grid = grid
        self.masks: list[tuple[str, np.ndarray]] = []
        self._grid_df = grid.grid

    # ------------------------------------------------------------------
    # Constraint builders
    # ------------------------------------------------------------------

    def add_phi_range(
        self, phi_min: float, phi_max: float, degrees: bool = False
    ) -> SpatialMask:
        """Add azimuth angle constraint.

        Parameters
        ----------
        phi_min : float
            Minimum azimuth angle.
        phi_max : float
            Maximum azimuth angle.
        degrees : bool
            If ``True``, angles are in degrees; otherwise radians.

        Returns
        -------
        SpatialMask
            Self for chaining.

        Notes
        -----
        Handles wraparound at 0°/360° correctly.
        Example: ``phi_range(350, 10)`` includes both 350°–360° and 0°–10°.

        """
        if degrees:
            phi_min = np.radians(phi_min)
            phi_max = np.radians(phi_max)

        phi = self._grid_df["phi"].to_numpy()

        # Normalise to [0, 2π)
        phi = np.mod(phi, 2 * np.pi)
        phi_min = np.mod(phi_min, 2 * np.pi)
        phi_max = np.mod(phi_max, 2 * np.pi)

        if phi_min <= phi_max:
            mask = (phi >= phi_min) & (phi <= phi_max)
        else:
            # Wraparound: e.g. 350° to 10°
            mask = (phi >= phi_min) | (phi <= phi_max)

        self.masks.append(("phi_range", mask))
        return self

    def add_theta_range(
        self, theta_min: float, theta_max: float, degrees: bool = False
    ) -> SpatialMask:
        """Add polar angle (zenith angle) constraint.

        Parameters
        ----------
        theta_min : float
            Minimum polar angle (0 = zenith).
        theta_max : float
            Maximum polar angle (π/2 = horizon).
        degrees : bool
            If ``True``, angles are in degrees; otherwise radians.

        Returns
        -------
        SpatialMask
            Self for chaining.

        Notes
        -----
        ``theta = 0°`` is zenith (straight up), ``theta = 90°`` is horizon.
        To exclude low elevations, use ``theta_max < 90°``.

        """
        if degrees:
            theta_min = np.radians(theta_min)
            theta_max = np.radians(theta_max)

        theta = self._grid_df["theta"].to_numpy()
        mask = (theta >= theta_min) & (theta <= theta_max)

        self.masks.append(("theta_range", mask))
        return self

    def add_elevation_range(
        self, elev_min: float, elev_max: float, degrees: bool = True
    ) -> SpatialMask:
        """Add elevation angle constraint (complementary to theta).

        Parameters
        ----------
        elev_min : float
            Minimum elevation angle (0 = horizon, 90 = zenith).
        elev_max : float
            Maximum elevation angle.
        degrees : bool
            If ``True``, angles are in degrees; otherwise radians.
            Default: ``True``.

        Returns
        -------
        SpatialMask
            Self for chaining.

        Notes
        -----
        ``elevation = 90° − theta``.  This is more intuitive for users
        who think in elevation angles.

        """
        if degrees:
            elev_min_rad = np.radians(elev_min)
            elev_max_rad = np.radians(elev_max)
        else:
            elev_min_rad = elev_min
            elev_max_rad = elev_max

        # Convert elevation → theta: theta = π/2 − elevation
        theta_max = np.pi / 2 - elev_min_rad
        theta_min = np.pi / 2 - elev_max_rad

        return self.add_theta_range(theta_min, theta_max, degrees=False)

    def add_cell_ids(self, cell_ids: list[int] | np.ndarray) -> SpatialMask:
        """Include specific cell IDs.

        Parameters
        ----------
        cell_ids : list[int] or np.ndarray
            Cell IDs to include.

        Returns
        -------
        SpatialMask
            Self for chaining.

        """
        mask = np.zeros(self.grid.ncells, dtype=bool)
        mask[cell_ids] = True

        self.masks.append(("cell_ids", mask))
        return self

    def add_exclude_cell_ids(self, cell_ids: list[int] | np.ndarray) -> SpatialMask:
        """Exclude specific cell IDs.

        Parameters
        ----------
        cell_ids : list[int] or np.ndarray
            Cell IDs to exclude.

        Returns
        -------
        SpatialMask
            Self for chaining.

        """
        mask = np.ones(self.grid.ncells, dtype=bool)
        mask[cell_ids] = False

        self.masks.append(("exclude_cell_ids", mask))
        return self

    def add_quality_threshold(
        self,
        var_name: str,
        min_value: float | None = None,
        max_value: float | None = None,
    ) -> SpatialMask:
        """Add data quality threshold based on grid cell properties.

        Parameters
        ----------
        var_name : str
            Variable name in grid (e.g. ``'mean_snr'``,
            ``'n_observations'``).
        min_value : float, optional
            Minimum allowed value.
        max_value : float, optional
            Maximum allowed value.

        Returns
        -------
        SpatialMask
            Self for chaining.

        Raises
        ------
        ValueError
            If *var_name* doesn't exist in the grid DataFrame.

        """
        if var_name not in self._grid_df.columns:
            available = list(self._grid_df.columns)
            raise ValueError(
                f"Variable '{var_name}' not found in grid. Available: {available}"
            )

        values = self._grid_df[var_name].to_numpy()
        mask = np.ones(self.grid.ncells, dtype=bool)

        if min_value is not None:
            mask = mask & (values >= min_value)
        if max_value is not None:
            mask = mask & (values <= max_value)

        self.masks.append((f"{var_name}_threshold", mask))
        return self

    def add_boundary_cells(self, exclude: bool = True) -> SpatialMask:
        """Include or exclude boundary cells.

        Parameters
        ----------
        exclude : bool
            If ``True``, exclude boundary cells; if ``False``, include
            only boundary cells.

        Returns
        -------
        SpatialMask
            Self for chaining.

        Raises
        ------
        ValueError
            If the grid does not have an ``'is_boundary'`` column.

        Notes
        -----
        Only works if the grid DataFrame contains an ``'is_boundary'``
        column.

        """
        if "is_boundary" not in self._grid_df.columns:
            raise ValueError("Grid does not have 'is_boundary' information")

        is_boundary = self._grid_df["is_boundary"].to_numpy()
        mask = ~is_boundary if exclude else is_boundary

        self.masks.append(("boundary_cells", mask))
        return self

    def add_custom_mask(
        self,
        mask: np.ndarray | Callable,
        name: str = "custom",
    ) -> SpatialMask:
        """Register a user-supplied mask or a mask-producing callable.

        Parameters
        ----------
        mask : np.ndarray or callable
            Either a boolean array of shape ``(ncells,)`` or a function
            ``(grid: GridData) -> np.ndarray`` that is evaluated against
            this builder's grid.
        name : str
            Label recorded for this mask (shown in the summary).

        Returns
        -------
        SpatialMask
            Self for chaining.

        Raises
        ------
        ValueError
            If the resulting value is not a boolean numpy array of the
            expected shape.

        Examples
        --------
        >>> custom = np.array([True, False, True, ...])
        >>> mask.add_custom_mask(custom)

        >>> def high_snr_north(grid):
        ...     snr   = grid.grid['mean_snr'].to_numpy()
        ...     phi   = grid.grid['phi'].to_numpy()
        ...     return (snr > 40) & (phi < np.pi)
        >>> mask.add_custom_mask(high_snr_north)

        """
        # Resolve callables against the builder's own grid instance.
        resolved = mask(self.grid) if callable(mask) else mask

        # Validate type, shape, and dtype before accepting the mask.
        if not isinstance(resolved, np.ndarray):
            raise ValueError("Custom mask must be numpy array or return numpy array")
        if resolved.shape != (self.grid.ncells,):
            raise ValueError(
                f"Custom mask shape {resolved.shape} doesn't match "
                f"grid size ({self.grid.ncells},)"
            )
        if resolved.dtype != bool:
            raise ValueError("Custom mask must be boolean dtype")

        self.masks.append((name, resolved))
        return self

    def add_radial_sector(
        self,
        center_phi: float,
        sector_width: float,
        theta_min: float | None = None,
        theta_max: float | None = None,
        degrees: bool = True,
    ) -> SpatialMask:
        """Add radial sector (wedge) mask.

        Parameters
        ----------
        center_phi : float
            Centre azimuth of sector.
        sector_width : float
            Full angular width of sector.
        theta_min : float, optional
            Minimum polar angle (radial inner bound). Defaults to 0
            (zenith) regardless of *degrees*.
        theta_max : float, optional
            Maximum polar angle (radial outer bound). Defaults to the
            horizon (90° / π/2) regardless of *degrees*.
        degrees : bool
            If ``True``, all angles in degrees; otherwise radians.

        Returns
        -------
        SpatialMask
            Self for chaining.

        Notes
        -----
        The theta defaults were previously expressed in radians
        (``0.0`` and ``np.pi / 2``) yet still passed through the
        degree→radian conversion when ``degrees=True`` (the default),
        silently shrinking the sector's polar range to ~1.6°. Defaults
        are now resolved *after* unit conversion so they always mean
        "full polar range".

        Examples
        --------
        >>> # Northern sector, 30° wide, excluding low elevations
        >>> mask.add_radial_sector(center_phi=0, sector_width=30,
        ...                        theta_max=60, degrees=True)

        """
        if degrees:
            center_phi = np.radians(center_phi)
            sector_width = np.radians(sector_width)
            # Only convert theta bounds the caller actually supplied;
            # defaults are applied below, already in radians.
            if theta_min is not None:
                theta_min = np.radians(theta_min)
            if theta_max is not None:
                theta_max = np.radians(theta_max)

        if theta_min is None:
            theta_min = 0.0
        if theta_max is None:
            theta_max = np.pi / 2

        half_width = sector_width / 2
        self.add_phi_range(
            center_phi - half_width,
            center_phi + half_width,
            degrees=False,
        )
        self.add_theta_range(theta_min, theta_max, degrees=False)

        return self

    # ------------------------------------------------------------------
    # Computation & introspection
    # ------------------------------------------------------------------

    def compute(self, mode: str = "AND") -> np.ndarray:
        """Combine all registered masks into one boolean mask.

        Parameters
        ----------
        mode : str
            Combination mode:

            * ``'AND'`` – all constraints must be satisfied (intersection).
            * ``'OR'``  – at least one constraint must be satisfied (union).

        Returns
        -------
        np.ndarray
            Boolean array of shape ``(ncells,)`` where ``True`` = include.

        Raises
        ------
        ValueError
            If no masks have been added or *mode* is unknown.

        """
        if not self.masks:
            raise ValueError("No masks added. Use add_* methods before compute()")

        # Stack into a (n_masks, ncells) array and reduce along the mask axis.
        layers = np.stack([selected for _, selected in self.masks])
        op = mode.upper()
        if op == "AND":
            return layers.all(axis=0)
        if op == "OR":
            return layers.any(axis=0)
        raise ValueError(f"Unknown mode: {mode}. Use 'AND' or 'OR'")

    def get_mask_summary(self) -> dict:
        """Summarise every registered mask and their combinations.

        Returns
        -------
        dict
            Dictionary with mask names and per-mask / combined cell counts.

        """
        total = self.grid.ncells
        summary: dict = {
            "total_cells": total,
            "masks": [
                {
                    "name": name,
                    "n_cells_included": int(selected.sum()),
                    "fraction_included": int(selected.sum()) / total,
                }
                for name, selected in self.masks
            ],
        }

        # Combined statistics only make sense once at least one mask exists
        # (compute() raises on an empty builder).
        if self.masks:
            combined: dict = {}
            for mode in ("AND", "OR"):
                result = self.compute(mode=mode)
                n_cells = int(result.sum())
                combined[mode] = {
                    "n_cells": n_cells,
                    "fraction": float(n_cells / total),
                }
            summary["combined"] = combined

        return summary

    def clear(self) -> SpatialMask:
        """Drop every registered mask, resetting the builder to empty.

        Returns
        -------
        SpatialMask
            Self for chaining.

        """
        # Rebind (rather than mutate) so any external reference to the old
        # list is left untouched, matching the original behaviour.
        self.masks = []
        return self

    def __repr__(self) -> str:
        """Return the developer-facing representation.

        Returns
        -------
        str
            Representation string.

        """
        mask_names = [name for name, _ in self.masks]
        return (
            f"SpatialMask(grid={self.grid.grid_type}, "
            f"n_masks={len(self.masks)}, masks={mask_names})"
        )

__init__(grid)

Initialize the spatial mask builder.

Parameters

grid : GridData Grid instance.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
49
50
51
52
53
54
55
56
57
58
59
60
def __init__(self, grid: GridData) -> None:
    """Initialize the spatial mask builder.

    Parameters
    ----------
    grid : GridData
        Grid instance whose cells the masks select from.

    """
    self.grid = grid
    # Ordered (label, boolean-array) pairs accumulated by the add_* methods
    # and combined later by compute().
    self.masks: list[tuple[str, np.ndarray]] = []
    # Shortcut to the grid's per-cell table (grid.grid) for column lookups.
    self._grid_df = grid.grid

add_phi_range(phi_min, phi_max, degrees=False)

Add azimuth angle constraint.

Parameters

phi_min : float Minimum azimuth angle. phi_max : float Maximum azimuth angle. degrees : bool If True, angles are in degrees; otherwise radians.

Returns

SpatialMask Self for chaining.

Notes

Handles wraparound at 0°/360° correctly. Example: phi_range(350, 10) includes both 350°–360° and 0°–10°.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def add_phi_range(
    self, phi_min: float, phi_max: float, degrees: bool = False
) -> SpatialMask:
    """Constrain the selection to an azimuth window.

    Parameters
    ----------
    phi_min : float
        Minimum azimuth angle.
    phi_max : float
        Maximum azimuth angle.
    degrees : bool
        If ``True``, angles are in degrees; otherwise radians.

    Returns
    -------
    SpatialMask
        Self for chaining.

    Notes
    -----
    Handles wraparound at 0°/360° correctly.
    Example: ``phi_range(350, 10)`` includes both 350°–360° and 0°–10°.

    """
    if degrees:
        phi_min = np.radians(phi_min)
        phi_max = np.radians(phi_max)

    two_pi = 2 * np.pi
    # Map everything onto [0, 2π) so the bounds can be compared directly.
    cell_phi = np.mod(self._grid_df["phi"].to_numpy(), two_pi)
    lo = np.mod(phi_min, two_pi)
    hi = np.mod(phi_max, two_pi)

    if lo <= hi:
        in_window = (cell_phi >= lo) & (cell_phi <= hi)
    else:
        # Window crosses 0/2π (e.g. 350°–10°): union of the two arcs.
        in_window = (cell_phi >= lo) | (cell_phi <= hi)

    self.masks.append(("phi_range", in_window))
    return self

add_theta_range(theta_min, theta_max, degrees=False)

Add polar angle (zenith angle) constraint.

Parameters

theta_min : float Minimum polar angle (0 = zenith). theta_max : float Maximum polar angle (π/2 = horizon). degrees : bool If True, angles are in degrees; otherwise radians.

Returns

SpatialMask Self for chaining.

Notes

theta = 0° is zenith (straight up), theta = 90° is horizon. To exclude low elevations, use theta_max < 90°.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def add_theta_range(
    self, theta_min: float, theta_max: float, degrees: bool = False
) -> SpatialMask:
    """Constrain the selection to a polar (zenith) angle band.

    Parameters
    ----------
    theta_min : float
        Minimum polar angle (0 = zenith).
    theta_max : float
        Maximum polar angle (π/2 = horizon).
    degrees : bool
        If ``True``, angles are in degrees; otherwise radians.

    Returns
    -------
    SpatialMask
        Self for chaining.

    Notes
    -----
    ``theta = 0°`` is zenith (straight up), ``theta = 90°`` is horizon.
    To exclude low elevations, use ``theta_max < 90°``.

    """
    if degrees:
        theta_min, theta_max = np.radians(theta_min), np.radians(theta_max)

    cell_theta = self._grid_df["theta"].to_numpy()
    in_band = (cell_theta >= theta_min) & (cell_theta <= theta_max)

    self.masks.append(("theta_range", in_band))
    return self

add_elevation_range(elev_min, elev_max, degrees=True)

Add elevation angle constraint (complementary to theta).

Parameters

elev_min : float Minimum elevation angle (0 = horizon, 90 = zenith). elev_max : float Maximum elevation angle. degrees : bool If True, angles are in degrees; otherwise radians. Default: True.

Returns

SpatialMask Self for chaining.

Notes

elevation = 90° − theta. This is more intuitive for users who think in elevation angles.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def add_elevation_range(
    self, elev_min: float, elev_max: float, degrees: bool = True
) -> SpatialMask:
    """Constrain the selection by elevation angle (complement of theta).

    Parameters
    ----------
    elev_min : float
        Minimum elevation angle (0 = horizon, 90 = zenith).
    elev_max : float
        Maximum elevation angle.
    degrees : bool
        If ``True``, angles are in degrees; otherwise radians.
        Default: ``True``.

    Returns
    -------
    SpatialMask
        Self for chaining.

    Notes
    -----
    ``elevation = 90° − theta``.  This is more intuitive for users
    who think in elevation angles.

    """
    lo = np.radians(elev_min) if degrees else elev_min
    hi = np.radians(elev_max) if degrees else elev_max

    # Elevation and polar angle are complementary (theta = π/2 − elevation),
    # so the bounds swap roles when converted.
    return self.add_theta_range(np.pi / 2 - hi, np.pi / 2 - lo, degrees=False)

add_cell_ids(cell_ids)

Include specific cell IDs.

Parameters

cell_ids : list[int] or np.ndarray Cell IDs to include.

Returns

SpatialMask Self for chaining.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def add_cell_ids(self, cell_ids: list[int] | np.ndarray) -> SpatialMask:
    """Restrict the selection to an explicit set of cell IDs.

    Parameters
    ----------
    cell_ids : list[int] or np.ndarray
        Cell IDs to include.

    Returns
    -------
    SpatialMask
        Self for chaining.

    """
    # Start with everything deselected, then switch on the listed cells.
    selected = np.zeros(self.grid.ncells, dtype=bool)
    selected[cell_ids] = True

    self.masks.append(("cell_ids", selected))
    return self

add_exclude_cell_ids(cell_ids)

Exclude specific cell IDs.

Parameters

cell_ids : list[int] or np.ndarray Cell IDs to exclude.

Returns

SpatialMask Self for chaining.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def add_exclude_cell_ids(self, cell_ids: list[int] | np.ndarray) -> SpatialMask:
    """Remove an explicit set of cell IDs from the selection.

    Parameters
    ----------
    cell_ids : list[int] or np.ndarray
        Cell IDs to exclude.

    Returns
    -------
    SpatialMask
        Self for chaining.

    """
    # Start with everything selected, then switch off the listed cells.
    selected = np.ones(self.grid.ncells, dtype=bool)
    selected[cell_ids] = False

    self.masks.append(("exclude_cell_ids", selected))
    return self

add_quality_threshold(var_name, min_value=None, max_value=None)

Add data quality threshold based on grid cell properties.

Parameters

var_name : str Variable name in grid (e.g. 'mean_snr', 'n_observations'). min_value : float, optional Minimum allowed value. max_value : float, optional Maximum allowed value.

Returns

SpatialMask Self for chaining.

Raises

ValueError If var_name doesn't exist in the grid DataFrame.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
def add_quality_threshold(
    self,
    var_name: str,
    min_value: float | None = None,
    max_value: float | None = None,
) -> SpatialMask:
    """Filter cells by a per-cell data-quality variable.

    Parameters
    ----------
    var_name : str
        Variable name in grid (e.g. ``'mean_snr'``,
        ``'n_observations'``).
    min_value : float, optional
        Minimum allowed value.
    max_value : float, optional
        Maximum allowed value.

    Returns
    -------
    SpatialMask
        Self for chaining.

    Raises
    ------
    ValueError
        If *var_name* doesn't exist in the grid DataFrame.

    """
    if var_name not in self._grid_df.columns:
        available = list(self._grid_df.columns)
        raise ValueError(
            f"Variable '{var_name}' not found in grid. Available: {available}"
        )

    values = self._grid_df[var_name].to_numpy()

    # Apply whichever bounds were supplied; unspecified bounds pass all cells.
    keep = np.ones(self.grid.ncells, dtype=bool)
    if min_value is not None:
        keep &= values >= min_value
    if max_value is not None:
        keep &= values <= max_value

    self.masks.append((f"{var_name}_threshold", keep))
    return self

add_boundary_cells(exclude=True)

Include or exclude boundary cells.

Parameters

exclude : bool If True, exclude boundary cells; if False, include only boundary cells.

Returns

SpatialMask Self for chaining.

Raises

ValueError If the grid does not have an 'is_boundary' column.

Notes

Only works if the grid DataFrame contains an 'is_boundary' column.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
def add_boundary_cells(self, exclude: bool = True) -> SpatialMask:
    """Include or exclude boundary cells.

    Parameters
    ----------
    exclude : bool
        If ``True``, exclude boundary cells; if ``False``, include
        only boundary cells.

    Returns
    -------
    SpatialMask
        Self for chaining.

    Raises
    ------
    ValueError
        If the grid does not have an ``'is_boundary'`` column.

    Notes
    -----
    Only works if the grid DataFrame contains an ``'is_boundary'``
    column.

    """
    if "is_boundary" not in self._grid_df.columns:
        raise ValueError("Grid does not have 'is_boundary' information")

    boundary_flags = self._grid_df["is_boundary"].to_numpy()
    # Either keep only non-boundary cells, or keep only boundary cells.
    selected = np.logical_not(boundary_flags) if exclude else boundary_flags

    self.masks.append(("boundary_cells", selected))
    return self

add_custom_mask(mask, name='custom')

Add custom mask or mask-generating callable.

Parameters

mask : np.ndarray or callable * If array: boolean mask of shape (ncells,). * If callable: function (grid: GridData) -> np.ndarray. name : str Label for this mask (used in summary).

Returns

SpatialMask Self for chaining.

Raises

ValueError If the resulting array has wrong shape or dtype.

Examples

custom = np.array([True, False, True, ...]) mask.add_custom_mask(custom)

def high_snr_north(grid): ... snr = grid.grid['mean_snr'].to_numpy() ... phi = grid.grid['phi'].to_numpy() ... return (snr > 40) & (phi < np.pi) mask.add_custom_mask(high_snr_north)

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
def add_custom_mask(
    self,
    mask: np.ndarray | Callable,
    name: str = "custom",
) -> SpatialMask:
    """Add custom mask or mask-generating callable.

    Parameters
    ----------
    mask : np.ndarray or callable
        * If array: boolean mask of shape ``(ncells,)``.
        * If callable: function ``(grid: GridData) -> np.ndarray``.
    name : str
        Label for this mask (used in summary).

    Returns
    -------
    SpatialMask
        Self for chaining.

    Raises
    ------
    ValueError
        If the resulting array has wrong shape or dtype.

    Examples
    --------
    >>> custom = np.array([True, False, True, ...])
    >>> mask.add_custom_mask(custom)

    >>> def high_snr_north(grid):
    ...     snr   = grid.grid['mean_snr'].to_numpy()
    ...     phi   = grid.grid['phi'].to_numpy()
    ...     return (snr > 40) & (phi < np.pi)
    >>> mask.add_custom_mask(high_snr_north)

    """
    if callable(mask):
        mask_array = mask(self.grid)
    else:
        mask_array = mask

    if not isinstance(mask_array, np.ndarray):
        raise ValueError("Custom mask must be numpy array or return numpy array")
    if mask_array.shape != (self.grid.ncells,):
        raise ValueError(
            f"Custom mask shape {mask_array.shape} doesn't match "
            f"grid size ({self.grid.ncells},)"
        )
    if mask_array.dtype != bool:
        raise ValueError("Custom mask must be boolean dtype")

    self.masks.append((name, mask_array))
    return self

add_radial_sector(center_phi, sector_width, theta_min=0.0, theta_max=np.pi / 2, degrees=True)

Add radial sector (wedge) mask.

Parameters

center_phi : float Centre azimuth of sector. sector_width : float Full angular width of sector. theta_min : float Minimum polar angle (radial inner bound). theta_max : float Maximum polar angle (radial outer bound). degrees : bool If True, all angles in degrees; otherwise radians.

Returns

SpatialMask Self for chaining.

Examples

Northern sector, 30° wide, excluding low elevations

mask.add_radial_sector(center_phi=0, sector_width=30, ... theta_max=60, degrees=True)

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
def add_radial_sector(
    self,
    center_phi: float,
    sector_width: float,
    theta_min: float = 0.0,
    theta_max: float = np.pi / 2,
    degrees: bool = True,
) -> SpatialMask:
    """Add radial sector (wedge) mask.

    Parameters
    ----------
    center_phi : float
        Centre azimuth of sector.
    sector_width : float
        Full angular width of sector.
    theta_min : float
        Minimum polar angle (radial inner bound).
    theta_max : float
        Maximum polar angle (radial outer bound).
    degrees : bool
        If ``True``, all angles in degrees; otherwise radians.

    Returns
    -------
    SpatialMask
        Self for chaining.

    Examples
    --------
    >>> # Northern sector, 30° wide, excluding low elevations
    >>> mask.add_radial_sector(center_phi=0, sector_width=30,
    ...                        theta_max=60, degrees=True)

    """
    if degrees:
        center_phi = np.radians(center_phi)
        sector_width = np.radians(sector_width)
        theta_min = np.radians(theta_min)
        theta_max = np.radians(theta_max)

    half_width = sector_width / 2
    self.add_phi_range(
        center_phi - half_width,
        center_phi + half_width,
        degrees=False,
    )
    self.add_theta_range(theta_min, theta_max, degrees=False)

    return self

compute(mode='AND')

Compute final combined boolean mask.

Parameters

mode : str Combination mode:

* ``'AND'`` – all constraints must be satisfied (intersection).
* ``'OR'`` – at least one constraint must be satisfied (union).

Returns

np.ndarray Boolean array of shape (ncells,) where True = include.

Raises

ValueError If no masks have been added or mode is unknown.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
def compute(self, mode: str = "AND") -> np.ndarray:
    """Compute final combined boolean mask.

    Parameters
    ----------
    mode : str
        Combination mode:

        * ``'AND'`` – all constraints must be satisfied (intersection).
        * ``'OR'``  – at least one constraint must be satisfied (union).

    Returns
    -------
    np.ndarray
        Boolean array of shape ``(ncells,)`` where ``True`` = include.

    Raises
    ------
    ValueError
        If no masks have been added or *mode* is unknown.

    """
    if not self.masks:
        raise ValueError("No masks added. Use add_* methods before compute()")

    if mode.upper() == "AND":
        combined = np.ones(self.grid.ncells, dtype=bool)
        for _name, mask in self.masks:
            combined = combined & mask
    elif mode.upper() == "OR":
        combined = np.zeros(self.grid.ncells, dtype=bool)
        for _name, mask in self.masks:
            combined = combined | mask
    else:
        raise ValueError(f"Unknown mode: {mode}. Use 'AND' or 'OR'")

    return combined

get_mask_summary()

Get summary of all added masks.

Returns

dict Dictionary with mask names and per-mask / combined cell counts.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
def get_mask_summary(self) -> dict:
    """Get summary of all added masks.

    Returns
    -------
    dict
        Dictionary with mask names and per-mask / combined cell counts.

    """
    summary: dict = {"total_cells": self.grid.ncells, "masks": []}

    for name, mask in self.masks:
        n_included = int(mask.sum())
        summary["masks"].append(
            {
                "name": name,
                "n_cells_included": n_included,
                "fraction_included": n_included / self.grid.ncells,
            }
        )

    if self.masks:
        combined_and = self.compute(mode="AND")
        combined_or = self.compute(mode="OR")
        summary["combined"] = {
            "AND": {
                "n_cells": int(combined_and.sum()),
                "fraction": float(combined_and.sum() / self.grid.ncells),
            },
            "OR": {
                "n_cells": int(combined_or.sum()),
                "fraction": float(combined_or.sum() / self.grid.ncells),
            },
        }

    return summary

clear()

Clear all masks.

Returns

SpatialMask Self for chaining.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
490
491
492
493
494
495
496
497
498
499
500
def clear(self) -> SpatialMask:
    """Clear all masks.

    Returns
    -------
    SpatialMask
        Self for chaining.

    """
    self.masks = []
    return self

__repr__()

Return the developer-facing representation.

Returns

str Representation string.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
502
503
504
505
506
507
508
509
510
511
512
513
514
515
def __repr__(self) -> str:
    """Return the developer-facing representation.

    Returns
    -------
    str
        Representation string.

    """
    mask_names = [name for name, _ in self.masks]
    return (
        f"SpatialMask(grid={self.grid.grid_type}, "
        f"n_masks={len(self.masks)}, masks={mask_names})"
    )

create_hemisphere_mask(grid, hemisphere='north')

Create mask for a cardinal hemisphere.

Parameters

grid : GridData Grid instance. hemisphere : str One of 'north', 'south', 'east', 'west'.

Returns

np.ndarray Boolean mask.

Raises

ValueError If hemisphere is not recognised.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
def create_hemisphere_mask(grid: GridData, hemisphere: str = "north") -> np.ndarray:
    """Build a boolean mask selecting one cardinal hemisphere.

    Parameters
    ----------
    grid : GridData
        Grid instance.
    hemisphere : str
        One of ``'north'``, ``'south'``, ``'east'``, ``'west'``
        (case-insensitive).

    Returns
    -------
    np.ndarray
        Boolean mask.

    Raises
    ------
    ValueError
        If *hemisphere* is not recognised.

    """
    # Azimuth windows in degrees; 'north' wraps around 0°/360°.
    ranges = {
        "north": (315, 45),
        "south": (135, 225),
        "east": (45, 135),
        "west": (225, 315),
    }

    key = hemisphere.lower()
    if key not in ranges:
        raise ValueError(
            f"Unknown hemisphere: {hemisphere}. Choose from {list(ranges.keys())}"
        )

    phi_lo, phi_hi = ranges[key]
    builder = SpatialMask(grid)
    builder.add_phi_range(phi_lo, phi_hi, degrees=True)
    return builder.compute()

create_elevation_mask(grid, min_elevation=30.0, max_elevation=90.0)

Create mask based on elevation angle.

Parameters

grid : GridData Grid instance. min_elevation : float Minimum elevation angle in degrees (default: 30°). max_elevation : float Maximum elevation angle in degrees (default: 90°).

Returns

np.ndarray Boolean mask.

Source code in packages/canvod-grids/src/canvod/grids/analysis/masking.py
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
def create_elevation_mask(
    grid: GridData, min_elevation: float = 30.0, max_elevation: float = 90.0
) -> np.ndarray:
    """Build a boolean mask selecting cells within an elevation band.

    Parameters
    ----------
    grid : GridData
        Grid instance.
    min_elevation : float
        Minimum elevation angle in degrees (default: 30°).
    max_elevation : float
        Maximum elevation angle in degrees (default: 90°).

    Returns
    -------
    np.ndarray
        Boolean mask.

    """
    # Single-constraint builder: add the elevation band and evaluate it.
    return (
        SpatialMask(grid)
        .add_elevation_range(min_elevation, max_elevation, degrees=True)
        .compute()
    )

Weighting

Weighting strategies for spatial aggregation of hemispherical grid data.

Provides tools to calculate and combine different weighting schemes for computing weighted means across grid cells. Critical for unbiased spatial statistics when cells have different sizes or data quality.

Classes

WeightCalculator – builder for combined spatial weights.

Convenience functions

compute_uniform_weights – equal weight per cell. compute_area_weights – solid-angle-only weights.

Notes

  • Supported weight types: solid_angle, observation_count, snr, sin_elevation, inverse_variance, custom.
  • Multiple weights are combined element-wise (multiply or add) and optionally normalised to sum to 1.
  • Dask-backed datasets are handled efficiently: only scalar statistics are computed eagerly; masks stay lazy.

WeightCalculator

Calculate and combine weights for spatial aggregation.

Parameters

grid : GridData Grid instance. ds : xr.Dataset or None Dataset with data variables (required for data-dependent weights such as observation_count, snr, inverse_variance).

Examples

weights = WeightCalculator(grid, vod_ds) weights.add_weight('solid_angle') weights.add_weight('observation_count', normalize=True) total_weights = weights.compute()

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
class WeightCalculator:
    """Calculate and combine weights for spatial aggregation.

    Parameters
    ----------
    grid : GridData
        Grid instance.
    ds : xr.Dataset or None
        Dataset with data variables (required for data-dependent weights
        such as ``observation_count``, ``snr``, ``inverse_variance``).

    Examples
    --------
    >>> weights = WeightCalculator(grid, vod_ds)
    >>> weights.add_weight('solid_angle')
    >>> weights.add_weight('observation_count', normalize=True)
    >>> total_weights = weights.compute()

    """

    def __init__(self, grid: GridData, ds: xr.Dataset | None = None) -> None:
        """Initialize the weight calculator.

        Parameters
        ----------
        grid : GridData
            Grid instance.
        ds : xr.Dataset | None, optional
            Dataset with data variables for data-dependent weights.

        """
        self.grid = grid
        self.ds = ds
        # One array of shape (ncells,) per added weight component.
        self.weights: dict[str, np.ndarray] = {}
        # Parameters used to build each component (for introspection).
        self.weight_params: dict[str, dict] = {}
        self._grid_df = grid.grid

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def add_weight(
        self,
        weight_type: str,
        normalize: bool = True,
        **kwargs: Any,
    ) -> WeightCalculator:
        """Add a weight component.

        Parameters
        ----------
        weight_type : str
            One of ``'solid_angle'``, ``'observation_count'``,
            ``'snr'``, ``'sin_elevation'``, ``'inverse_variance'``,
            ``'custom'``.
        normalize : bool
            If ``True``, normalise this component to sum to 1.0 before
            combination.
        **kwargs
            Weight-specific parameters (see individual ``_compute_*``
            methods).

        Returns
        -------
        WeightCalculator
            Self for chaining.

        Raises
        ------
        ValueError
            If *weight_type* is not a recognised weight kind.

        Examples
        --------
        >>> calc.add_weight('solid_angle')
        >>> calc.add_weight('observation_count', var_name='VOD', normalize=True)
        >>> calc.add_weight('custom', values=my_weights, normalize=False)

        """
        if weight_type in self.weights:
            logger.warning(f"Weight '{weight_type}' already exists, overwriting")

        _DISPATCH = {
            "solid_angle": self._compute_solid_angle_weight,
            "observation_count": self._compute_observation_count_weight,
            "snr": self._compute_snr_weight,
            "sin_elevation": self._compute_sin_elevation_weight,
            "inverse_variance": self._compute_inverse_variance_weight,
            "custom": self._compute_custom_weight,
        }

        if weight_type not in _DISPATCH:
            raise ValueError(
                f"Unknown weight_type: {weight_type}. Valid: {list(_DISPATCH.keys())}"
            )

        weight = _DISPATCH[weight_type](**kwargs)

        if normalize:
            weight = self._normalize_weights(weight)

        self.weights[weight_type] = weight
        self.weight_params[weight_type] = {"normalize": normalize, **kwargs}
        return self

    def compute(
        self,
        combination: Literal["multiply", "add"] = "multiply",
        normalize_final: bool = True,
    ) -> np.ndarray:
        """Compute final combined weights.

        Parameters
        ----------
        combination : str
            ``'multiply'`` – element-wise product (default).
            ``'add'``      – element-wise sum.
        normalize_final : bool
            If ``True``, normalise the final array to sum to 1.0.

        Returns
        -------
        np.ndarray
            Weight array of shape ``(ncells,)``.

        Raises
        ------
        ValueError
            If no weights have been added or *combination* is unknown.

        """
        if not self.weights:
            raise ValueError("No weights added. Use add_weight() before compute()")

        if combination == "multiply":
            combined = np.ones(self.grid.ncells)
            for w in self.weights.values():
                combined = combined * w
        elif combination == "add":
            combined = np.zeros(self.grid.ncells)
            for w in self.weights.values():
                combined = combined + w
        else:
            raise ValueError(f"Unknown combination: {combination}")

        if normalize_final:
            combined = self._normalize_weights(combined)

        n_nonzero = int(np.sum(combined > 0))
        # Guard the log line: with normalize_final=False and all-zero
        # weights, combined[combined > 0] is empty and .min() would raise
        # "zero-size array to reduction operation" (the uniform-weights
        # fallback in _normalize_weights only protects the normalized path).
        if n_nonzero > 0:
            nonzero_vals = combined[combined > 0]
            logger.info(
                f"Computed weights: {n_nonzero}/{self.grid.ncells} cells with "
                f"non-zero weight, min={nonzero_vals.min():.6f}, max={combined.max():.6f}"
            )
        else:
            logger.warning(
                f"Computed weights: 0/{self.grid.ncells} cells with non-zero weight"
            )
        return combined

    # ------------------------------------------------------------------
    # Introspection
    # ------------------------------------------------------------------

    def get_weight_summary(self) -> dict:
        """Summary statistics for each weight component.

        Returns
        -------
        dict
            Nested dict keyed by weight type.

        """
        summary: dict = {"components": {}}
        for wtype, weight in self.weights.items():
            nonzero = weight > 0
            summary["components"][wtype] = {
                "n_nonzero": int(nonzero.sum()),
                "fraction_nonzero": float(nonzero.sum() / self.grid.ncells),
                "min": float(weight[nonzero].min()) if nonzero.any() else 0.0,
                "max": float(weight.max()),
                "mean": float(weight[nonzero].mean()) if nonzero.any() else 0.0,
                "params": self.weight_params[wtype],
            }
        return summary

    def remove_weight(self, weight_type: str) -> WeightCalculator:
        """Remove a weight component.

        Parameters
        ----------
        weight_type : str
            Weight type to remove.

        Returns
        -------
        WeightCalculator
            Self for chaining.

        """
        if weight_type in self.weights:
            del self.weights[weight_type]
            del self.weight_params[weight_type]
            logger.debug(f"Removed weight: {weight_type}")
        else:
            logger.warning(f"Weight '{weight_type}' not found")
        return self

    def clear(self) -> WeightCalculator:
        """Clear all weights.

        Returns
        -------
        WeightCalculator
            Self for chaining.

        """
        self.weights = {}
        self.weight_params = {}
        return self

    def __repr__(self) -> str:
        """Return the developer-facing representation.

        Returns
        -------
        str
            Representation string.

        """
        return (
            f"WeightCalculator(grid={self.grid.grid_type}, "
            f"weights={list(self.weights.keys())})"
        )

    # ------------------------------------------------------------------
    # Weight computation (private)
    # ------------------------------------------------------------------

    def _compute_solid_angle_weight(self, **kwargs: Any) -> np.ndarray:
        """Weights based on cell solid angles (geometric fairness)."""
        if "solid_angle" in self._grid_df.columns:
            solid_angles = self._grid_df["solid_angle"].to_numpy()
        else:
            logger.debug("Computing solid angles from grid geometry")
            solid_angles = self._compute_solid_angles_from_geometry()

        if np.any(solid_angles <= 0):
            logger.warning("Found non-positive solid angles, setting to small value")
            # Clamp so later normalization / multiplication stays well-defined.
            solid_angles = np.maximum(solid_angles, 1e-10)

        return solid_angles

    def _compute_solid_angles_from_geometry(self) -> np.ndarray:
        """Compute solid angles for each cell from grid geometry.

        Returns
        -------
        np.ndarray
            Solid angles in steradians.

        Notes
        -----
        The sum of solid angles should equal the hemisphere area
        (2π steradians) for a complete hemisphere.

        """
        grid_type = self.grid.grid_type

        if grid_type in ("equal_area", "equal_angle", "equirectangular"):
            return self._compute_rectangular_solid_angles()
        if grid_type == "htm":
            return self._compute_htm_solid_angles()
        if grid_type == "geodesic":
            return self._compute_geodesic_solid_angles()
        if grid_type in ("healpix", "fibonacci"):
            # Both are (approximately) equal-area
            theta = self._grid_df["theta"].to_numpy()
            hemisphere_cells = int(np.sum(theta <= np.pi / 2))
            if hemisphere_cells > 0:
                cell_area = (2 * np.pi) / hemisphere_cells
                return np.full(self.grid.ncells, cell_area)
            return np.zeros(self.grid.ncells)
        logger.warning(f"Unknown grid type: {grid_type}, using uniform weights")
        return np.full(self.grid.ncells, (2 * np.pi) / self.grid.ncells)

    def _compute_rectangular_solid_angles(self) -> np.ndarray:
        """Solid angle = Δφ × [cos(θ_min) − cos(θ_max)]."""
        phi_min = self._grid_df["phi_min"].to_numpy()
        phi_max = self._grid_df["phi_max"].to_numpy()
        theta_min = self._grid_df["theta_min"].to_numpy()
        theta_max = self._grid_df["theta_max"].to_numpy()

        delta_phi = phi_max - phi_min
        return delta_phi * (np.cos(theta_min) - np.cos(theta_max))

    def _compute_htm_solid_angles(self) -> np.ndarray:
        """Solid angles for HTM triangular cells via L'Huilier's theorem.

        For each spherical triangle with vertices on the unit sphere the
        solid angle is computed as:

            Ω = 4 arctan √(tan(s/2) tan((s−a)/2) tan((s−b)/2) tan((s−c)/2))

        where *a*, *b*, *c* are arc-lengths between vertex pairs and
        *s* = (a+b+c)/2.
        """
        solid_angles = np.zeros(self.grid.ncells)

        for i, row in enumerate(self._grid_df.iter_rows(named=True)):
            try:
                v0 = np.array(row["htm_vertex_0"], dtype=float)
                v1 = np.array(row["htm_vertex_1"], dtype=float)
                v2 = np.array(row["htm_vertex_2"], dtype=float)

                # Skip cells significantly below the horizon
                if v0[2] < -0.01 or v1[2] < -0.01 or v2[2] < -0.01:
                    continue

                # Normalise to unit sphere
                v0 = v0 / np.linalg.norm(v0)
                v1 = v1 / np.linalg.norm(v1)
                v2 = v2 / np.linalg.norm(v2)

                # Arc lengths between vertex pairs (clip guards acos domain)
                a = np.arccos(np.clip(np.dot(v1, v2), -1, 1))
                b = np.arccos(np.clip(np.dot(v0, v2), -1, 1))
                c = np.arccos(np.clip(np.dot(v0, v1), -1, 1))

                s = (a + b + c) / 2  # semi-perimeter

                product = (
                    np.tan(s / 2)
                    * np.tan((s - a) / 2)
                    * np.tan((s - b) / 2)
                    * np.tan((s - c) / 2)
                )

                # product can be slightly negative for degenerate triangles
                solid_angles[i] = (
                    4 * np.arctan(np.sqrt(product)) if product > 0 else 0.0
                )

            except Exception as e:
                # Best-effort: leave this cell at 0 and keep going
                logger.warning(f"Error computing HTM solid angle {i}: {e}")

        return solid_angles

    def _compute_geodesic_solid_angles(self) -> np.ndarray:
        """Solid angles for geodesic cells from vertex data."""
        import polars as pl

        solid_angles = np.zeros(self.grid.ncells)

        if hasattr(self.grid, "vertices") and self.grid.vertices is not None:
            vertices_df = self.grid.vertices

            for cell_id in range(self.grid.ncells):
                try:
                    cell_verts = vertices_df.filter(pl.col("cell_id") == cell_id).sort(
                        "vertex_idx"
                    )
                    if len(cell_verts) < 3:
                        continue

                    # Use the first three vertices as a spherical triangle
                    x = cell_verts["x"].to_numpy()[:3]
                    y = cell_verts["y"].to_numpy()[:3]
                    z = cell_verts["z"].to_numpy()[:3]

                    v0 = np.array([x[0], y[0], z[0]])
                    v1 = np.array([x[1], y[1], z[1]])
                    v2 = np.array([x[2], y[2], z[2]])

                    v0 = v0 / np.linalg.norm(v0)
                    v1 = v1 / np.linalg.norm(v1)
                    v2 = v2 / np.linalg.norm(v2)

                    # Van Oosterom–Strackee formula for the spherical excess
                    numerator = np.abs(np.dot(v0, np.cross(v1, v2)))
                    denominator = 1 + np.dot(v0, v1) + np.dot(v1, v2) + np.dot(v2, v0)

                    solid_angles[cell_id] = 2 * np.arctan2(
                        numerator,
                        denominator,
                    )

                except Exception as e:
                    logger.warning(
                        f"Error computing geodesic solid angle {cell_id}: {e}"
                    )
        else:
            # Approximate: equal area
            solid_angles = np.full(
                self.grid.ncells,
                (2 * np.pi) / self.grid.ncells,
            )

        return solid_angles

    def _compute_observation_count_weight(
        self,
        var_name: str = "VOD",
        cell_id_var: str | None = None,
        min_count: int = 1,
        **kwargs: Any,
    ) -> np.ndarray:
        """Weights based on number of observations per cell.

        Parameters
        ----------
        var_name : str
            Variable to count observations for.
        cell_id_var : str, optional
            Cell-ID variable name (auto-detected if ``None``).
        min_count : int
            Cells with fewer observations get weight 0.

        Raises
        ------
        ValueError
            If no dataset was provided or *var_name* is missing.

        """
        if self.ds is None:
            raise ValueError("Dataset required for observation_count weights.")

        cell_id_var = self._resolve_cell_id_var(cell_id_var)

        if var_name not in self.ds:
            raise ValueError(f"Variable '{var_name}' not found in dataset")

        logger.info("Computing observation counts (this may take 30-60 seconds)...")

        cell_ids_da = self.ds[cell_id_var].data.ravel()
        var_values_da = self.ds[var_name].data.ravel()

        # Mark invalid samples with -1 so they fall outside the histogram bins
        valid_mask = da.isfinite(cell_ids_da) & da.isfinite(var_values_da)
        valid_cell_ids = da.where(valid_mask, cell_ids_da, -1).astype(np.int32)

        counts, _ = da.histogram(
            valid_cell_ids[valid_cell_ids >= 0],
            bins=np.arange(-0.5, self.grid.ncells + 0.5),
        )
        counts = counts.compute()

        weights = counts.astype(float)
        weights[counts < min_count] = 0.0

        # Guard the mean: counts[counts > 0] is empty when no cell has data,
        # and np.mean of an empty array emits a RuntimeWarning and yields nan.
        has_data = counts > 0
        mean_count = counts[has_data].mean() if has_data.any() else 0.0
        logger.info(
            f"Observation counts: min={counts.min()}, max={counts.max()}, "
            f"mean={mean_count:.1f}, "
            f"cells_with_data={np.sum(has_data)}"
        )
        return weights

    def _compute_snr_weight(
        self,
        snr_var: str = "SNR",
        cell_id_var: str | None = None,
        aggregation: Literal["mean", "median", "max"] = "mean",
        **kwargs: Any,
    ) -> np.ndarray:
        """Weights based on signal-to-noise ratio.

        Parameters
        ----------
        snr_var : str
            SNR variable name in the dataset.
        cell_id_var : str, optional
            Cell-ID variable name.
        aggregation : str
            Per-cell aggregation: ``'mean'``, ``'median'``, or ``'max'``.

        Raises
        ------
        ValueError
            If no dataset was provided, *snr_var* is missing, or
            *aggregation* is unknown.

        """
        if self.ds is None:
            raise ValueError("Dataset required for SNR weights")

        # Use pre-aggregated column if available
        if "mean_snr" in self._grid_df.columns:
            logger.info("Using pre-computed mean_snr from grid")
            return self._grid_df["mean_snr"].to_numpy()

        if snr_var not in self.ds:
            raise ValueError(f"SNR variable '{snr_var}' not found in dataset")

        cell_id_var = self._resolve_cell_id_var(cell_id_var)

        cell_ids = self.ds[cell_id_var].values.ravel()
        snr_values = self.ds[snr_var].values.ravel()

        valid = np.isfinite(cell_ids) & np.isfinite(snr_values)
        cell_ids_valid = cell_ids[valid].astype(int)
        snr_valid = snr_values[valid]

        _AGG = {"mean": np.mean, "median": np.median, "max": np.max}
        if aggregation not in _AGG:
            raise ValueError(f"Unknown aggregation: {aggregation}")
        agg_fn = _AGG[aggregation]

        # Cells without observations keep weight 0
        snr_per_cell = np.zeros(self.grid.ncells)
        for cell_id in range(self.grid.ncells):
            cell_mask = cell_ids_valid == cell_id
            if np.any(cell_mask):
                snr_per_cell[cell_id] = agg_fn(snr_valid[cell_mask])

        return snr_per_cell

    def _compute_sin_elevation_weight(self, **kwargs: Any) -> np.ndarray:
        """Geometric correction: weight = sin(elevation) = cos(θ).

        Higher elevation → shorter atmospheric path → higher weight.
        """
        theta = self._grid_df["theta"].to_numpy()
        # Clamp below-horizon cells (θ > π/2) to zero weight
        return np.maximum(np.cos(theta), 0.0)

    def _compute_inverse_variance_weight(
        self,
        var_name: str = "VOD",
        cell_id_var: str | None = None,
        min_observations: int = 2,
        regularization: float = 0.0,
        **kwargs: Any,
    ) -> np.ndarray:
        """Precision weighting: weight = 1 / (variance + regularization).

        Parameters
        ----------
        var_name : str
            Variable to compute per-cell variance for.
        cell_id_var : str, optional
            Cell-ID variable name.
        min_observations : int
            Minimum observations to compute variance.
        regularization : float
            Added to variance to avoid division by zero.

        Raises
        ------
        ValueError
            If no dataset was provided or *var_name* is missing.

        """
        if self.ds is None:
            raise ValueError("Dataset required for inverse_variance weights")
        if var_name not in self.ds:
            raise ValueError(f"Variable '{var_name}' not found in dataset")

        cell_id_var = self._resolve_cell_id_var(cell_id_var)

        cell_ids = self.ds[cell_id_var].values.ravel()
        var_values = self.ds[var_name].values.ravel()

        valid = np.isfinite(cell_ids) & np.isfinite(var_values)
        cell_ids_valid = cell_ids[valid].astype(int)
        var_valid = var_values[valid]

        # Sample variance (ddof=1) per cell; nan marks "not enough data"
        variances = np.full(self.grid.ncells, np.nan)
        for cell_id in range(self.grid.ncells):
            cell_mask = cell_ids_valid == cell_id
            if np.sum(cell_mask) >= min_observations:
                variances[cell_id] = np.var(var_valid[cell_mask], ddof=1)

        weights = np.zeros(self.grid.ncells)
        valid_var = np.isfinite(variances) & (variances > 0)
        if np.any(valid_var):
            weights[valid_var] = 1.0 / (variances[valid_var] + regularization)

        return weights

    def _compute_custom_weight(
        self,
        values: np.ndarray,
        **kwargs: Any,
    ) -> np.ndarray:
        """User-provided weight array.

        Parameters
        ----------
        values : np.ndarray
            Must have shape ``(ncells,)``.

        Raises
        ------
        TypeError
            If *values* is not a numpy array.
        ValueError
            If the shape does not match the grid.

        """
        if not isinstance(values, np.ndarray):
            raise TypeError("Custom weights must be numpy array")
        if values.shape != (self.grid.ncells,):
            raise ValueError(
                f"Custom weights shape {values.shape} doesn't match "
                f"grid size ({self.grid.ncells},)"
            )
        # Copy so later caller-side mutation cannot change stored weights
        return values.copy()

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _resolve_cell_id_var(self, cell_id_var: str | None) -> str:
        """Auto-detect cell-ID variable if not specified.

        Parameters
        ----------
        cell_id_var : str or None
            Explicit name, or ``None`` for auto-detection.

        Returns
        -------
        str
            Resolved variable name.

        Raises
        ------
        ValueError
            If no dataset is available or no ``cell_id_*`` variable is
            found.

        """
        if cell_id_var is not None:
            return cell_id_var

        # Explicit guard: without it, `candidate in self.ds` on a None
        # dataset raises an opaque TypeError instead of a clear message.
        if self.ds is None:
            raise ValueError("Dataset required to auto-detect cell_id variable.")

        candidate = f"cell_id_{self.grid.grid_type}"
        if candidate in self.ds:
            return candidate

        cell_vars = [v for v in self.ds.data_vars if v.startswith("cell_id_")]
        if cell_vars:
            logger.info(f"Auto-detected cell_id variable: {cell_vars[0]}")
            return cell_vars[0]

        raise ValueError("No cell_id variable found in dataset.")

    @staticmethod
    def _normalize_weights(weights: np.ndarray) -> np.ndarray:
        """Normalise weights to sum to 1.0, handling NaN and zeros."""
        weights = np.nan_to_num(weights, nan=0.0)
        weight_sum = weights.sum()
        if weight_sum > 0:
            return weights / weight_sum
        logger.warning("All weights are zero, returning uniform weights")
        return np.ones_like(weights) / len(weights)

__init__(grid, ds=None)

Initialize the weight calculator.

Parameters

grid : GridData Grid instance. ds : xr.Dataset | None, optional Dataset with data variables for data-dependent weights.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def __init__(self, grid: GridData, ds: xr.Dataset | None = None) -> None:
    """Initialize the weight calculator.

    Parameters
    ----------
    grid : GridData
        Grid instance.
    ds : xr.Dataset | None, optional
        Dataset with data variables for data-dependent weights.

    """
    self.grid = grid
    self.ds = ds
    self.weights: dict[str, np.ndarray] = {}
    self.weight_params: dict[str, dict] = {}
    self._grid_df = grid.grid

add_weight(weight_type, normalize=True, **kwargs)

Add a weight component.

Parameters

weight_type : str One of 'solid_angle', 'observation_count', 'snr', 'sin_elevation', 'inverse_variance', 'custom'. normalize : bool If True, normalise this component to sum to 1.0 before combination. **kwargs Weight-specific parameters (see individual _compute_* methods).

Returns

WeightCalculator Self for chaining.

Examples

calc.add_weight('solid_angle') calc.add_weight('observation_count', var_name='VOD', normalize=True) calc.add_weight('custom', values=my_weights, normalize=False)

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def add_weight(
    self,
    weight_type: str,
    normalize: bool = True,
    **kwargs: Any,
) -> WeightCalculator:
    """Add a weight component.

    Parameters
    ----------
    weight_type : str
        One of ``'solid_angle'``, ``'observation_count'``,
        ``'snr'``, ``'sin_elevation'``, ``'inverse_variance'``,
        ``'custom'``.
    normalize : bool
        If ``True``, normalise this component to sum to 1.0 before
        combination.
    **kwargs
        Weight-specific parameters (see individual ``_compute_*``
        methods).

    Returns
    -------
    WeightCalculator
        Self for chaining.

    Examples
    --------
    >>> calc.add_weight('solid_angle')
    >>> calc.add_weight('observation_count', var_name='VOD', normalize=True)
    >>> calc.add_weight('custom', values=my_weights, normalize=False)

    """
    if weight_type in self.weights:
        logger.warning(f"Weight '{weight_type}' already exists, overwriting")

    _DISPATCH = {
        "solid_angle": self._compute_solid_angle_weight,
        "observation_count": self._compute_observation_count_weight,
        "snr": self._compute_snr_weight,
        "sin_elevation": self._compute_sin_elevation_weight,
        "inverse_variance": self._compute_inverse_variance_weight,
        "custom": self._compute_custom_weight,
    }

    if weight_type not in _DISPATCH:
        raise ValueError(
            f"Unknown weight_type: {weight_type}. Valid: {list(_DISPATCH.keys())}"
        )

    weight = _DISPATCH[weight_type](**kwargs)

    if normalize:
        weight = self._normalize_weights(weight)

    self.weights[weight_type] = weight
    self.weight_params[weight_type] = {"normalize": normalize, **kwargs}
    return self

compute(combination='multiply', normalize_final=True)

Compute final combined weights.

Parameters

combination : str 'multiply' – element-wise product (default). 'add' – element-wise sum. normalize_final : bool If True, normalise the final array to sum to 1.0.

Returns

np.ndarray Weight array of shape (ncells,).

Raises

ValueError If no weights have been added or combination is unknown.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def compute(
    self,
    combination: Literal["multiply", "add"] = "multiply",
    normalize_final: bool = True,
) -> np.ndarray:
    """Compute final combined weights.

    Parameters
    ----------
    combination : str
        ``'multiply'`` – element-wise product (default).
        ``'add'``      – element-wise sum.
    normalize_final : bool
        If ``True``, normalise the final array to sum to 1.0.

    Returns
    -------
    np.ndarray
        Weight array of shape ``(ncells,)``.

    Raises
    ------
    ValueError
        If no weights have been added or *combination* is unknown.

    """
    if not self.weights:
        raise ValueError("No weights added. Use add_weight() before compute()")

    if combination == "multiply":
        combined = np.ones(self.grid.ncells)
        for w in self.weights.values():
            combined = combined * w
    elif combination == "add":
        combined = np.zeros(self.grid.ncells)
        for w in self.weights.values():
            combined = combined + w
    else:
        raise ValueError(f"Unknown combination: {combination}")

    if normalize_final:
        combined = self._normalize_weights(combined)

    n_nonzero = int(np.sum(combined > 0))
    nonzero_vals = combined[combined > 0]
    logger.info(
        f"Computed weights: {n_nonzero}/{self.grid.ncells} cells with "
        f"non-zero weight, min={nonzero_vals.min():.6f}, max={combined.max():.6f}"
    )
    return combined

get_weight_summary()

Summary statistics for each weight component.

Returns

dict Nested dict keyed by weight type.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
def get_weight_summary(self) -> dict:
    """Summary statistics for each weight component.

    Returns
    -------
    dict
        Nested dict keyed by weight type.

    """
    summary: dict = {"components": {}}
    for wtype, weight in self.weights.items():
        nonzero = weight > 0
        summary["components"][wtype] = {
            "n_nonzero": int(nonzero.sum()),
            "fraction_nonzero": float(nonzero.sum() / self.grid.ncells),
            "min": float(weight[nonzero].min()) if nonzero.any() else 0.0,
            "max": float(weight.max()),
            "mean": float(weight[nonzero].mean()) if nonzero.any() else 0.0,
            "params": self.weight_params[wtype],
        }
    return summary

remove_weight(weight_type)

Remove a weight component.

Parameters

weight_type : str Weight type to remove.

Returns

WeightCalculator Self for chaining.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def remove_weight(self, weight_type: str) -> WeightCalculator:
    """Remove a weight component.

    Parameters
    ----------
    weight_type : str
        Weight type to remove.

    Returns
    -------
    WeightCalculator
        Self for chaining.

    """
    if weight_type in self.weights:
        del self.weights[weight_type]
        del self.weight_params[weight_type]
        logger.debug(f"Removed weight: {weight_type}")
    else:
        logger.warning(f"Weight '{weight_type}' not found")
    return self

clear()

Clear all weights.

Returns

WeightCalculator Self for chaining.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
241
242
243
244
245
246
247
248
249
250
251
252
def clear(self) -> WeightCalculator:
    """Clear all weights.

    Returns
    -------
    WeightCalculator
        Self for chaining.

    """
    self.weights = {}
    self.weight_params = {}
    return self

__repr__()

Return the developer-facing representation.

Returns

str Representation string.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
254
255
256
257
258
259
260
261
262
263
264
265
266
def __repr__(self) -> str:
    """Return the developer-facing representation.

    Returns
    -------
    str
        Representation string.

    """
    return (
        f"WeightCalculator(grid={self.grid.grid_type}, "
        f"weights={list(self.weights.keys())})"
    )

compute_uniform_weights(grid)

Uniform weights (all cells equal).

Parameters

grid : GridData Grid instance.

Returns

np.ndarray Array of 1 / ncells for each cell.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
def compute_uniform_weights(grid: GridData) -> np.ndarray:
    """Uniform weights (all cells equal).

    Parameters
    ----------
    grid : GridData
        Grid instance.

    Returns
    -------
    np.ndarray
        Array of ``1 / ncells`` for each cell.

    """
    # Every cell receives the same weight, summing to 1.0 overall.
    n_cells = grid.ncells
    return np.full(n_cells, 1.0 / n_cells)

compute_area_weights(grid, normalize=True)

Weights based on cell solid angles only.

Parameters

grid : GridData Grid instance. normalize : bool If True, normalise to sum to 1.0.

Returns

np.ndarray Area-based weights.

Source code in packages/canvod-grids/src/canvod/grids/analysis/weighting.py
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
def compute_area_weights(grid: GridData, normalize: bool = True) -> np.ndarray:
    """Compute weights from cell solid angles alone.

    Parameters
    ----------
    grid : GridData
        Grid instance.
    normalize : bool
        If ``True``, normalise to sum to 1.0.

    Returns
    -------
    np.ndarray
        Area-based weights.

    """
    # Delegate to WeightCalculator with a single solid-angle component.
    calculator = WeightCalculator(grid)
    calculator.add_weight("solid_angle", normalize=normalize)
    return calculator.compute(normalize_final=normalize)

Solar Geometry

Solar position calculations and corrections for VOD data.

Provides tools to compute solar positions and apply corrections to account for solar radiation effects on vegetation optical depth measurements.

Classes

SolarPositionCalculator – solar zenith / azimuth and VOD correction.

Convenience functions

compute_solar_zenith – quick zenith-only computation. filter_daytime_data – mask nighttime observations.

Notes

  • When pvlib is installed it is preferred for high-accuracy calculations. The built-in fallback uses NOAA algorithms (accuracy ~0.01° for 1800–2100).
  • All public methods accept either np.ndarray of datetime64 or pd.DatetimeIndex.

SolarPositionCalculator

Calculate solar positions for VOD corrections.

Parameters

lat : float Observer latitude in degrees (positive = North). lon : float Observer longitude in degrees (positive = East). elevation : float Elevation above sea level in metres (default: 0). use_pvlib : bool If True, use pvlib when available (more accurate). Falls back to built-in formulas automatically.

Examples

calc = SolarPositionCalculator(lat=40.0, lon=-105.0, elevation=1655) times = pd.date_range('2025-01-01', periods=24, freq='1H') zenith, azimuth = calc.compute_solar_position(times) corrected = calc.apply_solar_correction(vod_data, times)

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
class SolarPositionCalculator:
    """Calculate solar positions for VOD corrections.

    Parameters
    ----------
    lat : float
        Observer latitude in degrees (positive = North).
    lon : float
        Observer longitude in degrees (positive = East).
    elevation : float
        Elevation above sea level in metres (default: 0).
    use_pvlib : bool
        If ``True``, use *pvlib* when available (more accurate).
        Falls back to built-in formulas automatically.

    Examples
    --------
    >>> calc = SolarPositionCalculator(lat=40.0, lon=-105.0, elevation=1655)
    >>> times = pd.date_range('2025-01-01', periods=24, freq='1H')
    >>> zenith, azimuth = calc.compute_solar_position(times)
    >>> corrected = calc.apply_solar_correction(vod_data, times)

    """

    def __init__(
        self,
        lat: float,
        lon: float,
        elevation: float = 0.0,
        use_pvlib: bool = True,
    ) -> None:
        """Initialize the solar position calculator.

        Parameters
        ----------
        lat : float
            Observer latitude in degrees.
        lon : float
            Observer longitude in degrees.
        elevation : float, default 0.0
            Elevation above sea level in metres.
        use_pvlib : bool, default True
            Whether to use pvlib if available.

        """
        self.lat = lat
        self.lon = lon
        self.elevation = elevation
        self.use_pvlib = use_pvlib
        # Imported pvlib module handle; stays None when unavailable.
        self.pvlib = None

        if use_pvlib:
            try:
                import pvlib

                self.pvlib = pvlib
                logger.info("Using pvlib for solar position calculations")
            except ImportError:
                # Degrade gracefully: switch the flag off so all later calls
                # take the built-in NOAA path without re-checking the import.
                logger.warning(
                    "pvlib not available, falling back to built-in formulas. "
                    "Install pvlib for higher accuracy: pip install pvlib"
                )
                self.use_pvlib = False

    # ------------------------------------------------------------------
    # Core position computation
    # ------------------------------------------------------------------

    def compute_solar_position(
        self, times: np.ndarray | pd.DatetimeIndex
    ) -> tuple[np.ndarray, np.ndarray]:
        """Compute solar zenith and azimuth angles.

        Parameters
        ----------
        times : np.ndarray or pd.DatetimeIndex
            Array of ``datetime64`` or ``DatetimeIndex``.

        Returns
        -------
        solar_zenith : np.ndarray
            Solar zenith angles in degrees (0° = directly overhead).
        solar_azimuth : np.ndarray
            Solar azimuth angles in degrees (0° = North, 90° = East).

        """
        # Normalise ndarray input so both backends receive a DatetimeIndex.
        if isinstance(times, np.ndarray):
            times = pd.to_datetime(times)

        if self.use_pvlib and self.pvlib is not None:
            return self._compute_solar_position_pvlib(times)
        return self._compute_solar_position_builtin(times)

    def _compute_solar_position_pvlib(
        self, times: pd.DatetimeIndex
    ) -> tuple[np.ndarray, np.ndarray]:
        """Compute solar position using pvlib (high accuracy)."""
        location = self.pvlib.location.Location(
            latitude=self.lat,
            longitude=self.lon,
            altitude=self.elevation,
            tz="UTC",
        )

        # Naive timestamps are interpreted as UTC.
        if times.tz is None:
            times = times.tz_localize("UTC")

        solar_position = location.get_solarposition(times)

        # 'apparent_zenith' includes atmospheric refraction.
        zenith = solar_position["apparent_zenith"].values
        azimuth = solar_position["azimuth"].values
        return zenith, azimuth

    def _compute_solar_position_builtin(
        self, times: pd.DatetimeIndex
    ) -> tuple[np.ndarray, np.ndarray]:
        """Compute solar position using built-in NOAA algorithms.

        Accuracy ~0.01° for years 1800–2100.
        """
        # Julian day -> Julian centuries since the J2000.0 epoch.
        jd = self._datetime_to_julian_day(times)
        jc = (jd - 2451545.0) / 36525.0

        # Geometric mean longitude of sun (degrees)
        geom_mean_long = (280.46646 + jc * (36000.76983 + jc * 0.0003032)) % 360

        # Geometric mean anomaly of sun (degrees)
        geom_mean_anom = 357.52911 + jc * (35999.05029 - 0.0001537 * jc)

        # Eccentricity of Earth's orbit
        eccent = 0.016708634 - jc * (0.000042037 + 0.0000001267 * jc)

        # Sun equation of centre
        sun_eq_ctr = (
            np.sin(np.radians(geom_mean_anom))
            * (1.914602 - jc * (0.004817 + 0.000014 * jc))
            + np.sin(np.radians(2 * geom_mean_anom)) * (0.019993 - 0.000101 * jc)
            + np.sin(np.radians(3 * geom_mean_anom)) * 0.000289
        )

        # Sun true longitude (degrees)
        sun_true_long = geom_mean_long + sun_eq_ctr

        # Sun apparent longitude (degrees)
        sun_app_long = (
            sun_true_long
            - 0.00569
            - 0.00478 * np.sin(np.radians(125.04 - 1934.136 * jc))
        )

        # Mean obliquity of ecliptic (degrees)
        mean_obliq_ecliptic = (
            23
            + (26 + (21.448 - jc * (46.815 + jc * (0.00059 - jc * 0.001813))) / 60) / 60
        )

        # Obliquity correction (degrees)
        obliq_corr = mean_obliq_ecliptic + 0.00256 * np.cos(
            np.radians(125.04 - 1934.136 * jc)
        )

        # Sun declination (degrees)
        sun_decl = np.degrees(
            np.arcsin(np.sin(np.radians(obliq_corr)) * np.sin(np.radians(sun_app_long)))
        )

        # Equation of time (minutes)
        var_y = np.tan(np.radians(obliq_corr / 2)) ** 2
        eq_of_time = 4 * np.degrees(
            var_y * np.sin(2 * np.radians(geom_mean_long))
            - 2 * eccent * np.sin(np.radians(geom_mean_anom))
            + 4
            * eccent
            * var_y
            * np.sin(np.radians(geom_mean_anom))
            * np.cos(2 * np.radians(geom_mean_long))
            - 0.5 * var_y * var_y * np.sin(4 * np.radians(geom_mean_long))
            - 1.25 * eccent * eccent * np.sin(2 * np.radians(geom_mean_anom))
        )

        # True solar time (minutes)
        # NOTE(review): assumes input times are UTC — verify callers.
        time_offset = eq_of_time + 4 * self.lon
        hour = times.hour + times.minute / 60.0 + times.second / 3600.0
        true_solar_time = (hour * 60 + time_offset) % 1440

        # Hour angle (degrees)
        hour_angle = true_solar_time / 4 - 180
        hour_angle = np.where(hour_angle < 0, hour_angle + 360, hour_angle)

        # Solar zenith angle (degrees)
        lat_rad = np.radians(self.lat)
        decl_rad = np.radians(sun_decl)
        ha_rad = np.radians(hour_angle)

        # cos(ha) is unaffected by the wrap above, so the zenith is sound.
        zenith = np.degrees(
            np.arccos(
                np.sin(lat_rad) * np.sin(decl_rad)
                + np.cos(lat_rad) * np.cos(decl_rad) * np.cos(ha_rad)
            )
        )

        # Solar azimuth angle (degrees from North)
        azimuth_rad = np.arccos(
            (np.sin(lat_rad) * np.cos(np.radians(zenith)) - np.sin(decl_rad))
            / (np.cos(lat_rad) * np.sin(np.radians(zenith)))
        )
        azimuth = np.degrees(azimuth_rad)

        # Adjust for morning vs afternoon
        # NOTE(review): hour_angle was wrapped to [0, 360) above, so this
        # condition is True for nearly all times; the NOAA convention tests
        # the *signed* hour angle (negative = morning) and offsets the acos
        # result by 180°. Validate the builtin azimuth against the pvlib
        # path before relying on it.
        azimuth = np.where(hour_angle > 0, azimuth, 360 - azimuth)

        return zenith, azimuth

    @staticmethod
    def _datetime_to_julian_day(times: pd.DatetimeIndex) -> np.ndarray:
        """Convert datetime to Julian Day Number."""
        # NOTE(review): both branches of this conditional pass ``times``
        # through unchanged; pd.to_datetime does the actual normalisation.
        dt = pd.to_datetime(times if not isinstance(times, pd.DatetimeIndex) else times)
        year = dt.year
        month = dt.month
        day = dt.day
        hour = dt.hour
        minute = dt.minute
        second = dt.second

        # Standard integer Gregorian-calendar JDN formula
        # (Fliegel & Van Flandern style month/year shift).
        a = (14 - month) // 12
        y = year + 4800 - a
        m = month + 12 * a - 3

        jdn = day + (153 * m + 2) // 5 + 365 * y + y // 4 - y // 100 + y // 400 - 32045
        # Julian days begin at 12:00 UT, hence the (hour - 12) offset.
        fraction = (hour - 12) / 24.0 + minute / 1440.0 + second / 86400.0

        return jdn + fraction

    # ------------------------------------------------------------------
    # Derived quantities
    # ------------------------------------------------------------------

    def compute_solar_elevation(
        self, times: np.ndarray | pd.DatetimeIndex
    ) -> np.ndarray:
        """Compute solar elevation angle (complementary to zenith).

        Parameters
        ----------
        times : np.ndarray or pd.DatetimeIndex
            Array of times.

        Returns
        -------
        np.ndarray
            Solar elevation angles in degrees (0° = horizon, 90° = overhead).

        """
        zenith, _ = self.compute_solar_position(times)
        return 90.0 - zenith

    def is_daytime(
        self,
        times: np.ndarray | pd.DatetimeIndex,
        twilight_angle: float = -6.0,
    ) -> np.ndarray:
        """Determine if times are during daytime.

        Parameters
        ----------
        times : np.ndarray or pd.DatetimeIndex
            Array of times.
        twilight_angle : float
            Solar elevation threshold in degrees.
            Common values: ``0`` (geometric), ``-6`` (civil),
            ``-12`` (nautical), ``-18`` (astronomical).

        Returns
        -------
        np.ndarray
            Boolean array (``True`` = daytime).

        """
        elevation = self.compute_solar_elevation(times)
        return elevation > twilight_angle

    # ------------------------------------------------------------------
    # Solar correction
    # ------------------------------------------------------------------

    def apply_solar_correction(
        self,
        data: xr.DataArray,
        method: Literal["normalize", "residual", "cos_correction"] = "normalize",
        reference_zenith: float = 45.0,
    ) -> xr.DataArray:
        """Apply solar correction to data.

        Parameters
        ----------
        data : xr.DataArray
            Input data with a time dimension (``'epoch'`` or ``'time'``).
        method : str
            Correction method:

            * ``'normalize'``      – normalise by cos(zenith) relative to
              *reference_zenith*.
            * ``'residual'``       – subtract a 4th-order polynomial fitted
              to the diurnal pattern (1-D data only; falls back to
              ``'normalize'`` for multi-dimensional data).
            * ``'cos_correction'`` – simple cosine correction.
        reference_zenith : float
            Reference zenith angle for normalisation (degrees).

        Returns
        -------
        xr.DataArray
            Solar-corrected data with correction metadata in attrs.

        """
        time_dim = "epoch" if "epoch" in data.dims else "time"
        times = pd.to_datetime(data[time_dim].values)

        zenith, _ = self.compute_solar_position(times)
        zenith_da = xr.DataArray(
            zenith,
            coords={time_dim: data[time_dim]},
            dims=[time_dim],
        )

        if method == "normalize":
            correction_factor = np.cos(np.radians(reference_zenith)) / np.cos(
                np.radians(zenith_da)
            )
            # Bound the factor: cos(zenith) -> 0 near the horizon would
            # otherwise blow the correction up.
            correction_factor = correction_factor.clip(0.5, 2.0)

            corrected = data * correction_factor
            corrected.attrs["solar_correction"] = "normalized"
            corrected.attrs["reference_zenith"] = reference_zenith

        elif method == "cos_correction":
            correction_factor = np.cos(np.radians(zenith_da))
            # Floor at 0.1 so the division stays bounded at low sun.
            correction_factor = correction_factor.clip(0.1, 1.0)

            corrected = data / correction_factor
            corrected.attrs["solar_correction"] = "cos_correction"

        elif method == "residual":
            hour = times.hour + times.minute / 60.0

            if len(data.dims) == 1:
                valid = np.isfinite(data.values)
                # Need more points than the degree-4 fit can trivially match.
                if np.sum(valid) > 10:
                    coeffs = np.polyfit(hour[valid], data.values[valid], deg=4)
                    solar_model = np.polyval(coeffs, hour)
                    solar_model_da = xr.DataArray(
                        solar_model,
                        coords={time_dim: data[time_dim]},
                        dims=[time_dim],
                    )
                    corrected = data - solar_model_da
                    corrected.attrs["solar_correction"] = "residual"
                    corrected.attrs["polynomial_degree"] = 4
                else:
                    logger.warning("Insufficient data for residual correction")
                    corrected = data
            else:
                logger.warning(
                    "Residual correction for multi-dimensional data not yet "
                    "implemented, using normalize instead"
                )
                # Same computation as the 'normalize' branch above.
                correction_factor = np.cos(np.radians(reference_zenith)) / np.cos(
                    np.radians(zenith_da)
                )
                correction_factor = correction_factor.clip(0.5, 2.0)
                corrected = data * correction_factor
                corrected.attrs["solar_correction"] = "normalize_fallback"
        else:
            raise ValueError(f"Unknown correction method: {method}")

        return corrected

    # ------------------------------------------------------------------
    # Binning & sunrise/sunset
    # ------------------------------------------------------------------

    def compute_solar_bins(
        self,
        times: np.ndarray | pd.DatetimeIndex,
        n_bins: int = 12,
    ) -> np.ndarray:
        """Bin times by solar elevation angle.

        Useful for solar-elevation-based composites instead of
        hour-of-day composites.

        Parameters
        ----------
        times : np.ndarray or pd.DatetimeIndex
            Array of times.
        n_bins : int
            Number of solar elevation bins (range −20° to 90°).

        Returns
        -------
        np.ndarray
            Bin indices (0-based) for each time.

        """
        elevation = self.compute_solar_elevation(times)
        bin_edges = np.linspace(-20, 90, n_bins + 1)
        # Out-of-range elevations are clamped into the first/last bin.
        bin_indices = np.digitize(elevation, bin_edges) - 1
        return np.clip(bin_indices, 0, n_bins - 1)

    def get_sunrise_sunset(
        self, date: datetime
    ) -> tuple[pd.Timestamp | None, pd.Timestamp | None]:
        """Compute sunrise and sunset times for a given date.

        Parameters
        ----------
        date : datetime
            Date to compute sunrise/sunset for.

        Returns
        -------
        sunrise : pd.Timestamp or None
            Sunrise time in UTC, or ``None`` if the sun never rises.
        sunset : pd.Timestamp or None
            Sunset time in UTC, or ``None`` if the sun never sets.

        """
        # Brute-force 1-minute scan over the civil day; the result is
        # therefore accurate to one minute.
        times = pd.date_range(
            start=date.replace(hour=0, minute=0, second=0),
            end=date.replace(hour=23, minute=59, second=59),
            freq="1min",
        )

        elevation = self.compute_solar_elevation(times)
        above_horizon = np.where(elevation > 0)[0]

        sunrise = times[above_horizon[0]] if len(above_horizon) > 0 else None
        sunset = times[above_horizon[-1]] if len(above_horizon) > 0 else None

        return sunrise, sunset

    # ------------------------------------------------------------------

    def __repr__(self) -> str:
        """Return the developer-facing representation.

        Returns
        -------
        str
            Representation string.

        """
        method = "pvlib" if self.use_pvlib else "builtin"
        return (
            f"SolarPositionCalculator(lat={self.lat:.4f}°, lon={self.lon:.4f}°, "
            f"elevation={self.elevation:.0f}m, method={method})"
        )

__init__(lat, lon, elevation=0.0, use_pvlib=True)

Initialize the solar position calculator.

Parameters

lat : float Observer latitude in degrees. lon : float Observer longitude in degrees. elevation : float, default 0.0 Elevation above sea level in metres. use_pvlib : bool, default True Whether to use pvlib if available.

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def __init__(
    self,
    lat: float,
    lon: float,
    elevation: float = 0.0,
    use_pvlib: bool = True,
) -> None:
    """Initialize the solar position calculator.

    Parameters
    ----------
    lat : float
        Observer latitude in degrees.
    lon : float
        Observer longitude in degrees.
    elevation : float, default 0.0
        Elevation above sea level in metres.
    use_pvlib : bool, default True
        Whether to use pvlib if available.

    """
    self.lat = lat
    self.lon = lon
    self.elevation = elevation
    self.use_pvlib = use_pvlib
    self.pvlib = None

    if not use_pvlib:
        return

    # Bind the optional backend lazily; fall back if it is missing.
    try:
        import pvlib
    except ImportError:
        logger.warning(
            "pvlib not available, falling back to built-in formulas. "
            "Install pvlib for higher accuracy: pip install pvlib"
        )
        self.use_pvlib = False
    else:
        self.pvlib = pvlib
        logger.info("Using pvlib for solar position calculations")

compute_solar_position(times)

Compute solar zenith and azimuth angles.

Parameters

times : np.ndarray or pd.DatetimeIndex Array of datetime64 or DatetimeIndex.

Returns

solar_zenith : np.ndarray Solar zenith angles in degrees (0° = directly overhead). solar_azimuth : np.ndarray Solar azimuth angles in degrees (0° = North, 90° = East).

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def compute_solar_position(
    self, times: np.ndarray | pd.DatetimeIndex
) -> tuple[np.ndarray, np.ndarray]:
    """Compute solar zenith and azimuth angles.

    Parameters
    ----------
    times : np.ndarray or pd.DatetimeIndex
        Array of ``datetime64`` or ``DatetimeIndex``.

    Returns
    -------
    solar_zenith : np.ndarray
        Solar zenith angles in degrees (0° = directly overhead).
    solar_azimuth : np.ndarray
        Solar azimuth angles in degrees (0° = North, 90° = East).

    """
    if isinstance(times, np.ndarray):
        times = pd.to_datetime(times)

    if self.use_pvlib and self.pvlib is not None:
        return self._compute_solar_position_pvlib(times)
    return self._compute_solar_position_builtin(times)

compute_solar_elevation(times)

Compute solar elevation angle (complementary to zenith).

Parameters

times : np.ndarray or pd.DatetimeIndex Array of times.

Returns

np.ndarray Solar elevation angles in degrees (0° = horizon, 90° = overhead).

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def compute_solar_elevation(
    self, times: np.ndarray | pd.DatetimeIndex
) -> np.ndarray:
    """Compute solar elevation angle (complementary to zenith).

    Parameters
    ----------
    times : np.ndarray or pd.DatetimeIndex
        Array of times.

    Returns
    -------
    np.ndarray
        Solar elevation angles in degrees (0° = horizon, 90° = overhead).

    """
    zenith, _ = self.compute_solar_position(times)
    return 90.0 - zenith

is_daytime(times, twilight_angle=-6.0)

Determine if times are during daytime.

Parameters

times : np.ndarray or pd.DatetimeIndex Array of times. twilight_angle : float Solar elevation threshold in degrees. Common values: 0 (geometric), -6 (civil), -12 (nautical), -18 (astronomical).

Returns

np.ndarray Boolean array (True = daytime).

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
def is_daytime(
    self,
    times: np.ndarray | pd.DatetimeIndex,
    twilight_angle: float = -6.0,
) -> np.ndarray:
    """Determine if times are during daytime.

    Parameters
    ----------
    times : np.ndarray or pd.DatetimeIndex
        Array of times.
    twilight_angle : float
        Solar elevation threshold in degrees.
        Common values: ``0`` (geometric), ``-6`` (civil),
        ``-12`` (nautical), ``-18`` (astronomical).

    Returns
    -------
    np.ndarray
        Boolean array (``True`` = daytime).

    """
    elevation = self.compute_solar_elevation(times)
    return elevation > twilight_angle

apply_solar_correction(data, method='normalize', reference_zenith=45.0)

Apply solar correction to data.

Parameters

data : xr.DataArray Input data with a time dimension ('epoch' or 'time'). method : str Correction method:

* ``'normalize'``      – normalise by cos(zenith) relative to
  *reference_zenith*.
* ``'residual'``       – subtract a 4th-order polynomial fitted
  to the diurnal pattern (1-D data only; falls back to
  ``'normalize'`` for multi-dimensional data).
* ``'cos_correction'`` – simple cosine correction.

reference_zenith : float Reference zenith angle for normalisation (degrees).

Returns

xr.DataArray Solar-corrected data with correction metadata in attrs.

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
def apply_solar_correction(
    self,
    data: xr.DataArray,
    method: Literal["normalize", "residual", "cos_correction"] = "normalize",
    reference_zenith: float = 45.0,
) -> xr.DataArray:
    """Apply solar correction to data.

    Parameters
    ----------
    data : xr.DataArray
        Input data with a time dimension (``'epoch'`` or ``'time'``).
    method : str
        Correction method:

        * ``'normalize'``      – normalise by cos(zenith) relative to
          *reference_zenith*.
        * ``'residual'``       – subtract a 4th-order polynomial fitted
          to the diurnal pattern (1-D data only; falls back to
          ``'normalize'`` for multi-dimensional data).
        * ``'cos_correction'`` – simple cosine correction.
    reference_zenith : float
        Reference zenith angle for normalisation (degrees).

    Returns
    -------
    xr.DataArray
        Solar-corrected data with correction metadata in attrs.

    Raises
    ------
    ValueError
        If *method* is not one of the supported strings.

    """
    # Prefer 'epoch' as the time dimension; fall back to 'time'.
    time_dim = "epoch" if "epoch" in data.dims else "time"
    times = pd.to_datetime(data[time_dim].values)

    zenith, _ = self.compute_solar_position(times)
    zenith_da = xr.DataArray(
        zenith,
        coords={time_dim: data[time_dim]},
        dims=[time_dim],
    )

    if method == "normalize":
        correction_factor = np.cos(np.radians(reference_zenith)) / np.cos(
            np.radians(zenith_da)
        )
        # Bound the factor: cos(zenith) -> 0 near the horizon would
        # otherwise blow the correction up.
        correction_factor = correction_factor.clip(0.5, 2.0)

        corrected = data * correction_factor
        corrected.attrs["solar_correction"] = "normalized"
        corrected.attrs["reference_zenith"] = reference_zenith

    elif method == "cos_correction":
        correction_factor = np.cos(np.radians(zenith_da))
        # Floor at 0.1 so the division stays bounded at low sun.
        correction_factor = correction_factor.clip(0.1, 1.0)

        corrected = data / correction_factor
        corrected.attrs["solar_correction"] = "cos_correction"

    elif method == "residual":
        hour = times.hour + times.minute / 60.0

        if len(data.dims) == 1:
            valid = np.isfinite(data.values)
            # Need more points than the degree-4 fit can trivially match.
            if np.sum(valid) > 10:
                coeffs = np.polyfit(hour[valid], data.values[valid], deg=4)
                solar_model = np.polyval(coeffs, hour)
                solar_model_da = xr.DataArray(
                    solar_model,
                    coords={time_dim: data[time_dim]},
                    dims=[time_dim],
                )
                corrected = data - solar_model_da
                corrected.attrs["solar_correction"] = "residual"
                corrected.attrs["polynomial_degree"] = 4
            else:
                # Best-effort: return the input unchanged rather than raise.
                logger.warning("Insufficient data for residual correction")
                corrected = data
        else:
            logger.warning(
                "Residual correction for multi-dimensional data not yet "
                "implemented, using normalize instead"
            )
            # Same computation as the 'normalize' branch above.
            correction_factor = np.cos(np.radians(reference_zenith)) / np.cos(
                np.radians(zenith_da)
            )
            correction_factor = correction_factor.clip(0.5, 2.0)
            corrected = data * correction_factor
            corrected.attrs["solar_correction"] = "normalize_fallback"
    else:
        raise ValueError(f"Unknown correction method: {method}")

    return corrected

compute_solar_bins(times, n_bins=12)

Bin times by solar elevation angle.

Useful for solar-elevation-based composites instead of hour-of-day composites.

Parameters

times : np.ndarray or pd.DatetimeIndex Array of times. n_bins : int Number of solar elevation bins (range −20° to 90°).

Returns

np.ndarray Bin indices (0-based) for each time.

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
def compute_solar_bins(
    self,
    times: np.ndarray | pd.DatetimeIndex,
    n_bins: int = 12,
) -> np.ndarray:
    """Bin times by solar elevation angle.

    Useful for solar-elevation-based composites instead of
    hour-of-day composites.

    Parameters
    ----------
    times : np.ndarray or pd.DatetimeIndex
        Array of times.
    n_bins : int
        Number of solar elevation bins (range −20° to 90°).

    Returns
    -------
    np.ndarray
        Bin indices (0-based) for each time.

    """
    elevation = self.compute_solar_elevation(times)
    bin_edges = np.linspace(-20, 90, n_bins + 1)
    bin_indices = np.digitize(elevation, bin_edges) - 1
    return np.clip(bin_indices, 0, n_bins - 1)

get_sunrise_sunset(date)

Compute sunrise and sunset times for a given date.

Parameters

date : datetime Date to compute sunrise/sunset for.

Returns

sunrise : pd.Timestamp or None Sunrise time in UTC, or None if the sun never rises. sunset : pd.Timestamp or None Sunset time in UTC, or None if the sun never sets.

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
def get_sunrise_sunset(
    self, date: datetime
) -> tuple[pd.Timestamp | None, pd.Timestamp | None]:
    """Compute sunrise and sunset times for a given date.

    Parameters
    ----------
    date : datetime
        Date to compute sunrise/sunset for.

    Returns
    -------
    sunrise : pd.Timestamp or None
        Sunrise time in UTC, or ``None`` if the sun never rises.
    sunset : pd.Timestamp or None
        Sunset time in UTC, or ``None`` if the sun never sets.

    """
    times = pd.date_range(
        start=date.replace(hour=0, minute=0, second=0),
        end=date.replace(hour=23, minute=59, second=59),
        freq="1min",
    )

    elevation = self.compute_solar_elevation(times)
    above_horizon = np.where(elevation > 0)[0]

    sunrise = times[above_horizon[0]] if len(above_horizon) > 0 else None
    sunset = times[above_horizon[-1]] if len(above_horizon) > 0 else None

    return sunrise, sunset

__repr__()

Return the developer-facing representation.

Returns

str Representation string.

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
481
482
483
484
485
486
487
488
489
490
491
492
493
494
def __repr__(self) -> str:
    """Return the developer-facing representation.

    Returns
    -------
    str
        Representation string.

    """
    backend = "pvlib" if self.use_pvlib else "builtin"
    return (
        f"SolarPositionCalculator(lat={self.lat:.4f}°, lon={self.lon:.4f}°, "
        f"elevation={self.elevation:.0f}m, method={backend})"
    )

compute_solar_zenith(lat, lon, times)

Quick computation of solar zenith angles.

Parameters

lat : float Observer latitude in degrees. lon : float Observer longitude in degrees. times : np.ndarray or pd.DatetimeIndex Times to compute for.

Returns

np.ndarray Solar zenith angles in degrees.

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
def compute_solar_zenith(
    lat: float, lon: float, times: np.ndarray | pd.DatetimeIndex
) -> np.ndarray:
    """Quick computation of solar zenith angles.

    Thin convenience wrapper: builds a throwaway
    ``SolarPositionCalculator`` and discards the azimuth component.

    Parameters
    ----------
    lat : float
        Observer latitude in degrees.
    lon : float
        Observer longitude in degrees.
    times : np.ndarray or pd.DatetimeIndex
        Times to compute for.

    Returns
    -------
    np.ndarray
        Solar zenith angles in degrees.

    """
    zenith, _azimuth = SolarPositionCalculator(lat, lon).compute_solar_position(times)
    return zenith

filter_daytime_data(data, lat, lon, twilight_angle=-6.0)

Filter data to include only daytime observations.

Nighttime values are set to NaN.

Parameters

data : xr.DataArray Data with a time dimension ('epoch' or 'time'). lat : float Observer latitude in degrees. lon : float Observer longitude in degrees. twilight_angle : float Elevation threshold for daytime (degrees below horizon).

Returns

xr.DataArray Filtered data.

Source code in packages/canvod-grids/src/canvod/grids/analysis/solar.py
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
def filter_daytime_data(
    data: xr.DataArray,
    lat: float,
    lon: float,
    twilight_angle: float = -6.0,
) -> xr.DataArray:
    """Filter data to include only daytime observations.

    Nighttime values are set to NaN.

    Parameters
    ----------
    data : xr.DataArray
        Data with a time dimension (``'epoch'`` or ``'time'``).
    lat : float
        Observer latitude in degrees.
    lon : float
        Observer longitude in degrees.
    twilight_angle : float
        Elevation threshold for daytime (degrees below horizon).

    Returns
    -------
    xr.DataArray
        Filtered data.

    Raises
    ------
    ValueError
        If *data* has neither an ``'epoch'`` nor a ``'time'`` dimension.

    """
    # Resolve the time dimension explicitly; previously a dataset with
    # neither dimension fell through to 'time' and failed with an
    # opaque KeyError inside xarray.
    if "epoch" in data.dims:
        time_dim = "epoch"
    elif "time" in data.dims:
        time_dim = "time"
    else:
        raise ValueError(
            f"Data must have an 'epoch' or 'time' dimension; got {tuple(data.dims)}"
        )

    calc = SolarPositionCalculator(lat, lon)
    times = pd.to_datetime(data[time_dim].values)
    is_day = calc.is_daytime(times, twilight_angle)
    return data.where(is_day)

Temporal Analysis

Temporal analysis of gridded VOD data.

Weighted time-series computation, diurnal cycle analysis, and temporal statistics with optional solar-position correction.

Classes

TemporalAnalysis Main analysis class; binds a VOD dataset to a grid and exposes methods for aggregation, solar correction, diurnal binning, and basic plotting.

Notes

  • All spatial masks and weight arrays must be 1-D with length grid.ncells.
  • When a SolarPositionCalculator is attached (via site_lat / site_lon), additional solar-corrected and solar-binned methods become available.
  • Plotting helpers are thin wrappers around matplotlib; they return (fig, ax) so callers can continue customising the figure.

TemporalAnalysis

Temporal analysis of gridded VOD data.

Binds a VOD dataset (with pre-assigned cell IDs) to a grid and exposes weighted aggregation, diurnal analysis, and plotting.

Parameters

vod_ds : xr.Dataset Dataset containing VOD data and a cell_id_<grid_name> variable. grid : GridData Grid instance (must expose .ncells). grid_name : str Suffix for the cell-ID variable (e.g. 'htm_10deg'). site_lat : float or None, optional Site latitude in degrees. Required for solar methods. site_lon : float or None, optional Site longitude in degrees. Required for solar methods. site_elevation : float, optional Site elevation in metres (default 0).

Raises

ValueError If cell_id_<grid_name> is not present in vod_ds.

Examples

analysis = TemporalAnalysis(vod_ds, grid, "htm_10deg") ts = analysis.compute_timeseries(aggregate="1D")

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
class TemporalAnalysis:
    """Temporal analysis of gridded VOD data.

    Binds a VOD dataset (with pre-assigned cell IDs) to a grid and
    exposes weighted aggregation, diurnal analysis, and plotting.

    Parameters
    ----------
    vod_ds : xr.Dataset
        Dataset containing VOD data and a ``cell_id_<grid_name>``
        variable.
    grid : GridData
        Grid instance (must expose ``.ncells``).
    grid_name : str
        Suffix for the cell-ID variable (e.g. ``'htm_10deg'``).
    site_lat : float or None, optional
        Site latitude in degrees.  Required for solar methods.
    site_lon : float or None, optional
        Site longitude in degrees.  Required for solar methods.
    site_elevation : float, optional
        Site elevation in metres (default 0).

    Raises
    ------
    ValueError
        If ``cell_id_<grid_name>`` is not present in *vod_ds*.

    Examples
    --------
    >>> analysis = TemporalAnalysis(vod_ds, grid, "htm_10deg")
    >>> ts = analysis.compute_timeseries(aggregate="1D")

    """

    def __init__(
        self,
        vod_ds: xr.Dataset,
        grid: GridData,
        grid_name: str,
        site_lat: float | None = None,
        site_lon: float | None = None,
        site_elevation: float = 0.0,
    ) -> None:
        """Initialize the temporal analysis helper.

        Parameters
        ----------
        vod_ds : xr.Dataset
            VOD dataset containing cell IDs.
        grid : GridData
            Grid instance.
        grid_name : str
            Grid name suffix for cell IDs.
        site_lat : float | None, optional
            Site latitude in degrees.
        site_lon : float | None, optional
            Site longitude in degrees.
        site_elevation : float, default 0.0
            Site elevation in metres.

        Raises
        ------
        ValueError
            If ``cell_id_<grid_name>`` is not present in *vod_ds*.

        """
        self.vod_ds = vod_ds
        self.grid = grid
        self.grid_name = grid_name
        self.cell_id_var = f"cell_id_{grid_name}"

        # Fail fast: validate the dataset before any further setup so a
        # bad grid_name surfaces before the solar calculator is built
        # and logged.
        if self.cell_id_var not in vod_ds:
            available = [v for v in vod_ds.data_vars if v.startswith("cell_id_")]
            raise ValueError(
                f"Cell ID variable '{self.cell_id_var}' not found in dataset. "
                f"Available: {available}"
            )

        # Solar calculator (optional) — enabled only when both site
        # coordinates are provided.
        if site_lat is not None and site_lon is not None:
            self.solar_calc: SolarPositionCalculator | None = SolarPositionCalculator(
                lat=site_lat, lon=site_lon, elevation=site_elevation
            )
            logger.info(
                "solar calculator enabled for (%.4f°, %.4f°)",
                site_lat,
                site_lon,
            )
        else:
            self.solar_calc = None

    # ------------------------------------------------------------------
    # Core aggregation
    # ------------------------------------------------------------------

    def compute_timeseries(
        self,
        var_name: str = "VOD",
        spatial_mask: np.ndarray | None = None,
        weights: np.ndarray | None = None,
        aggregate: str = "1D",
        min_cells: int = 1,
    ) -> xr.Dataset:
        """Compute a weighted time-series aggregated over space.

        Parameters
        ----------
        var_name : str, optional
            Data variable to aggregate.
        spatial_mask : np.ndarray or None, optional
            Boolean mask of shape ``(grid.ncells,)``; ``True`` = include.
        weights : np.ndarray or None, optional
            Cell weights of shape ``(grid.ncells,)``; normalised
            internally.  ``None`` → uniform weights.
        aggregate : str, optional
            Pandas-compatible frequency string for temporal resampling.
        min_cells : int, optional
            Minimum unique cells required per time bin.

        Returns
        -------
        xr.Dataset
            Variables: ``mean``, ``std``, ``n_cells``,
            ``n_observations``, ``sum_weights``.

        Raises
        ------
        ValueError
            If *var_name* is missing or mask/weight shapes are wrong.

        """
        if var_name not in self.vod_ds:
            raise ValueError(f"Variable '{var_name}' not found in dataset")

        logger.info(
            "compute_timeseries: var=%s aggregate=%s mask=%s weights=%s",
            var_name,
            aggregate,
            spatial_mask is not None,
            weights is not None,
        )

        var_data = self.vod_ds[var_name]
        cell_ids = self.vod_ds[self.cell_id_var]

        # Apply spatial mask
        if spatial_mask is not None:
            if spatial_mask.shape != (self.grid.ncells,):
                raise ValueError(
                    f"Spatial mask shape {spatial_mask.shape} doesn't match "
                    f"grid size ({self.grid.ncells},)"
                )
            # Single vectorised membership test instead of one full
            # boolean comparison per selected cell (previously
            # O(n_selected) passes over the data).  NaN cell IDs are
            # excluded by isin, matching the old equality chain.
            selected_cells = np.flatnonzero(spatial_mask)
            var_data = var_data.where(cell_ids.isin(selected_cells))
            logger.debug(
                "spatial mask applied: %d/%d cells",
                spatial_mask.sum(),
                self.grid.ncells,
            )

        # Prepare weights
        weights = self._prepare_weights(weights)

        return self._compute_weighted_timeseries(
            var_data, cell_ids, weights, aggregate, min_cells
        )

    # ------------------------------------------------------------------
    # Solar-corrected aggregation
    # ------------------------------------------------------------------

    def compute_timeseries_solar_corrected(
        self,
        var_name: str = "VOD",
        spatial_mask: np.ndarray | None = None,
        weights: np.ndarray | None = None,
        aggregate: str = "1D",
        min_cells: int = 1,
        solar_correction: Literal[
            "normalize", "residual", "cos_correction"
        ] = "normalize",
        reference_zenith: float = 45.0,
        daytime_only: bool = False,
        twilight_angle: float = -6.0,
    ) -> xr.Dataset:
        """Compute a time-series after applying a solar correction.

        Parameters
        ----------
        var_name : str, optional
            Data variable to correct and aggregate.
        spatial_mask : np.ndarray or None, optional
            Cell selection mask.
        weights : np.ndarray or None, optional
            Cell weights.
        aggregate : str, optional
            Temporal resampling frequency.
        min_cells : int, optional
            Minimum cells per time bin.
        solar_correction : {'normalize', 'residual', 'cos_correction'}
            Correction method passed to
            :meth:`SolarPositionCalculator.apply_solar_correction`.
        reference_zenith : float, optional
            Reference zenith for normalisation (degrees).
        daytime_only : bool, optional
            If ``True``, mask out nighttime epochs.
        twilight_angle : float, optional
            Solar-elevation threshold for daytime (degrees).

        Returns
        -------
        xr.Dataset
            Solar-corrected time-series with additional metadata attrs.

        Raises
        ------
        ValueError
            If no solar calculator is configured or *var_name* is not
            present in the dataset.

        """
        if self.solar_calc is None:
            raise ValueError(
                "Solar calculator not initialized. "
                "Provide site_lat and site_lon to TemporalAnalysis constructor."
            )
        # Validate up front (consistent with compute_timeseries) so a
        # missing variable raises ValueError instead of a bare KeyError.
        if var_name not in self.vod_ds:
            raise ValueError(f"Variable '{var_name}' not found in dataset")

        logger.info(
            "solar-corrected timeseries: correction=%s daytime_only=%s",
            solar_correction,
            daytime_only,
        )

        var_data = self.vod_ds[var_name]

        # Apply solar correction
        var_data_corrected = self.solar_calc.apply_solar_correction(
            var_data, method=solar_correction, reference_zenith=reference_zenith
        )

        # Daytime filter: NaN-out epochs where the sun is below the
        # twilight threshold.
        if daytime_only:
            times = pd.to_datetime(var_data["epoch"].values)
            is_day = self.solar_calc.is_daytime(times, twilight_angle)
            is_day_da = xr.DataArray(
                is_day, coords={"epoch": var_data_corrected["epoch"]}, dims=["epoch"]
            )
            var_data_corrected = var_data_corrected.where(is_day_da)
            logger.debug(
                "daytime filter: %d/%d timesteps kept",
                is_day.sum(),
                len(is_day),
            )

        # Temporary dataset with corrected variable
        corrected_name = f"{var_name}_solar_corrected"
        ds_temp = self.vod_ds.copy()
        ds_temp[corrected_name] = var_data_corrected

        # Reuse compute_timeseries via a lightweight temporary instance;
        # __new__ skips __init__ so the dataset is not re-validated.
        analysis_temp = TemporalAnalysis.__new__(TemporalAnalysis)
        analysis_temp.vod_ds = ds_temp
        analysis_temp.grid = self.grid
        analysis_temp.grid_name = self.grid_name
        analysis_temp.cell_id_var = self.cell_id_var
        analysis_temp.solar_calc = self.solar_calc

        ts = analysis_temp.compute_timeseries(
            var_name=corrected_name,
            spatial_mask=spatial_mask,
            weights=weights,
            aggregate=aggregate,
            min_cells=min_cells,
        )

        # Solar metadata
        ts.attrs["solar_correction"] = solar_correction
        ts.attrs["reference_zenith"] = reference_zenith
        ts.attrs["daytime_only"] = daytime_only
        if daytime_only:
            ts.attrs["twilight_angle"] = twilight_angle

        return ts

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _prepare_weights(self, weights: np.ndarray | None) -> np.ndarray:
        """Validate and normalise a weight array.

        Returns uniform weights when *weights* is ``None``; otherwise
        checks the shape against the grid and rescales to unit sum
        (a non-positive sum is left untouched).
        """
        ncells = self.grid.ncells
        if weights is None:
            logger.debug("using uniform weights")
            return np.full(ncells, 1.0 / ncells)

        if weights.shape != (ncells,):
            raise ValueError(
                f"Weights shape {weights.shape} doesn't match "
                f"grid size ({ncells},)"
            )
        total = weights.sum()
        normalised = weights / total if total > 0 else weights
        logger.debug("using provided weights (sum before normalisation=%.4f)", total)
        return normalised

    def _compute_weighted_timeseries(
        self,
        var_data: xr.DataArray,
        cell_ids: xr.DataArray,
        weights: np.ndarray,
        aggregate: str,
        min_cells: int,
    ) -> xr.Dataset:
        """Aggregate *var_data* into time bins with cell weights.

        Parameters
        ----------
        var_data : xr.DataArray
            Data on an ``epoch`` dimension, optionally with a ``sid``
            dimension.
        cell_ids : xr.DataArray
            Grid cell ID per observation; NaN marks missing assignment.
        weights : np.ndarray
            Per-cell weights indexed by integer cell ID.
        aggregate : str
            Pandas-compatible resampling frequency (e.g. ``"1D"``).
        min_cells : int
            Minimum unique cells required for a time bin to be kept.

        Returns
        -------
        xr.Dataset
            ``mean``, ``std``, ``n_cells``, ``n_observations``,
            ``sum_weights`` on the ``epoch`` dimension.  An empty
            dataset when no bin survives the filters.

        """
        times = var_data["epoch"].values
        n_sid = var_data.sizes.get("sid", 1)

        # Flatten (epoch × sid) → 1-D
        # NOTE(review): the np.repeat below assumes 'epoch' is the
        # leading dimension so ravel() keeps all sids of one epoch
        # contiguous — confirm against the dataset layout.
        values = var_data.values.ravel()
        cells = cell_ids.values.ravel()

        # Keep only entries where both the value and the cell ID are
        # finite (masked observations carry NaN in either array).
        valid = np.isfinite(values) & np.isfinite(cells)
        times_valid = np.repeat(times, n_sid)[valid]
        values_valid = values[valid]
        cells_valid = cells[valid].astype(int)

        df = pd.DataFrame(
            {"epoch": times_valid, "value": values_valid, "cell_id": cells_valid}
        )
        # Out-of-range cell IDs get zero weight rather than raising.
        df["weight"] = df["cell_id"].map(
            lambda cid: weights[cid] if cid < len(weights) else 0.0
        )
        df["epoch"] = pd.to_datetime(df["epoch"])
        df = df.set_index("epoch")

        grouped = df.groupby(pd.Grouper(freq=aggregate))

        result_rows: list[dict] = []
        for time_bin, group in grouped:
            if len(group) == 0:
                continue
            # Enforce minimum spatial coverage per time bin.
            n_cells = group["cell_id"].nunique()
            if n_cells < min_cells:
                continue

            w = group["weight"].values
            v = group["value"].values
            w_sum = w.sum()

            # Bins whose total weight is zero are dropped entirely.
            if w_sum > 0:
                weighted_mean = np.average(v, weights=w)
                weighted_std = np.sqrt(np.average((v - weighted_mean) ** 2, weights=w))
                result_rows.append(
                    {
                        "epoch": time_bin,
                        "mean": weighted_mean,
                        "std": weighted_std,
                        "n_cells": n_cells,
                        "n_observations": len(group),
                        "sum_weights": w_sum,
                    }
                )

        if not result_rows:
            logger.warning("no data after aggregation")
            return xr.Dataset()

        result_df = pd.DataFrame(result_rows)
        ds = xr.Dataset(
            {
                "mean": ("epoch", result_df["mean"].values),
                "std": ("epoch", result_df["std"].values),
                "n_cells": ("epoch", result_df["n_cells"].values),
                "n_observations": ("epoch", result_df["n_observations"].values),
                "sum_weights": ("epoch", result_df["sum_weights"].values),
            },
            coords={"epoch": result_df["epoch"].values},
        )
        ds.attrs["variable"] = var_data.name
        ds.attrs["grid"] = self.grid_name
        ds.attrs["aggregation"] = aggregate
        ds.attrs["min_cells"] = min_cells

        logger.info(
            "timeseries computed: %d steps, mean n_cells=%.1f",
            len(result_df),
            result_df["n_cells"].mean(),
        )
        return ds

    # ------------------------------------------------------------------
    # Diurnal cycle
    # ------------------------------------------------------------------

    def compute_diurnal_cycle(
        self,
        var_name: str = "VOD",
        spatial_mask: np.ndarray | None = None,
        weights: np.ndarray | None = None,
        hour_bins: int = 24,
        min_observations: int = 10,
    ) -> xr.Dataset:
        """Compute clock-time diurnal cycle (hour-of-day statistics).

        Parameters
        ----------
        var_name : str, optional
            Data variable to bin.
        spatial_mask : np.ndarray or None, optional
            Cell selection mask of shape ``(grid.ncells,)``.
        weights : np.ndarray or None, optional
            Cell weights.
        hour_bins : int, optional
            Number of equal-width hour bins over [0, 24).
        min_observations : int, optional
            Minimum observations required per bin.

        Returns
        -------
        xr.Dataset
            ``mean``, ``std``, ``n_observations`` on the ``hour``
            coordinate.

        Raises
        ------
        ValueError
            If *var_name* is missing or the spatial mask has the wrong
            shape.

        """
        if var_name not in self.vod_ds:
            raise ValueError(f"Variable '{var_name}' not found in dataset")

        logger.info("compute_diurnal_cycle: var=%s hour_bins=%d", var_name, hour_bins)

        var_data = self.vod_ds[var_name]
        cell_ids = self.vod_ds[self.cell_id_var]

        # Spatial mask: validate the shape (consistent with
        # compute_timeseries) and use a single vectorised membership
        # test instead of one comparison pass per selected cell.
        if spatial_mask is not None:
            if spatial_mask.shape != (self.grid.ncells,):
                raise ValueError(
                    f"Spatial mask shape {spatial_mask.shape} doesn't match "
                    f"grid size ({self.grid.ncells},)"
                )
            var_data = var_data.where(cell_ids.isin(np.flatnonzero(spatial_mask)))

        weights = self._prepare_weights(weights)

        # Hour of day (fractional)
        times = pd.to_datetime(var_data["epoch"].values)
        hours = times.hour + times.minute / 60.0

        n_sid = var_data.sizes.get("sid", 1)
        hour_edges = np.linspace(0, 24, hour_bins + 1)
        hour_centers = (hour_edges[:-1] + hour_edges[1:]) / 2

        # Flatten (epoch × sid) → 1-D
        values = var_data.values.ravel()
        cells = cell_ids.values.ravel()
        hours_flat = np.repeat(hours, n_sid)

        valid = np.isfinite(values) & np.isfinite(cells)
        df = pd.DataFrame(
            {
                "hour": hours_flat[valid],
                "value": values[valid],
                "cell_id": cells[valid].astype(int),
            }
        )
        df["weight"] = df["cell_id"].map(
            lambda cid: weights[cid] if cid < len(weights) else 0.0
        )
        df["hour_bin"] = pd.cut(
            df["hour"], bins=hour_edges, labels=hour_centers, include_lowest=True
        )

        grouped = df.groupby("hour_bin")

        means, stds, n_obs = [], [], []
        for hc in hour_centers:
            if hc in grouped.groups:
                group = grouped.get_group(hc)
                if len(group) >= min_observations:
                    w = group["weight"].values
                    v = group["value"].values
                    if w.sum() > 0:
                        wm = np.average(v, weights=w)
                        ws = np.sqrt(np.average((v - wm) ** 2, weights=w))
                    else:
                        # Zero total weight: statistics are undefined.
                        wm, ws = np.nan, np.nan
                    means.append(wm)
                    stds.append(ws)
                    n_obs.append(len(group))
                else:
                    means.append(np.nan)
                    stds.append(np.nan)
                    n_obs.append(0)
            else:
                means.append(np.nan)
                stds.append(np.nan)
                n_obs.append(0)

        ds = xr.Dataset(
            {
                "mean": ("hour", np.array(means)),
                "std": ("hour", np.array(stds)),
                "n_observations": ("hour", np.array(n_obs)),
            },
            coords={"hour": hour_centers},
        )
        ds.attrs.update(
            {
                "variable": var_name,
                "grid": self.grid_name,
                "hour_bins": hour_bins,
                "min_observations": min_observations,
            }
        )
        logger.info(
            "diurnal cycle: %d bins, mean n_obs=%.1f",
            hour_bins,
            np.nanmean(n_obs),
        )
        return ds

    def compute_diurnal_cycle_solar(
        self,
        var_name: str = "VOD",
        spatial_mask: np.ndarray | None = None,
        weights: np.ndarray | None = None,
        n_solar_bins: int = 12,
        min_observations: int = 10,
    ) -> xr.Dataset:
        """Diurnal cycle binned by solar elevation instead of clock time.

        Accounts for seasonal variation in solar position, producing a
        more physically meaningful diurnal pattern.

        Parameters
        ----------
        var_name : str, optional
            Data variable to bin.
        spatial_mask : np.ndarray or None, optional
            Cell selection mask of shape ``(grid.ncells,)``.
        weights : np.ndarray or None, optional
            Cell weights.
        n_solar_bins : int, optional
            Number of equal-width bins over [-20°, 90°].
        min_observations : int, optional
            Minimum observations per bin.

        Returns
        -------
        xr.Dataset
            ``mean``, ``std``, ``n_observations`` on the
            ``solar_elevation`` coordinate.

        Raises
        ------
        ValueError
            If no solar calculator is configured, *var_name* is missing,
            or the spatial mask has the wrong shape.

        """
        if self.solar_calc is None:
            raise ValueError(
                "Solar calculator not initialized. "
                "Provide site_lat and site_lon to TemporalAnalysis constructor."
            )
        # Validate up front (consistent with compute_diurnal_cycle) so a
        # missing variable raises ValueError instead of a bare KeyError.
        if var_name not in self.vod_ds:
            raise ValueError(f"Variable '{var_name}' not found in dataset")

        logger.info("solar-binned diurnal cycle: n_bins=%d", n_solar_bins)

        var_data = self.vod_ds[var_name]
        cell_ids = self.vod_ds[self.cell_id_var]

        # Spatial mask: validated shape + single vectorised membership
        # test instead of one comparison pass per selected cell.
        if spatial_mask is not None:
            if spatial_mask.shape != (self.grid.ncells,):
                raise ValueError(
                    f"Spatial mask shape {spatial_mask.shape} doesn't match "
                    f"grid size ({self.grid.ncells},)"
                )
            var_data = var_data.where(cell_ids.isin(np.flatnonzero(spatial_mask)))

        weights = self._prepare_weights(weights)

        # Solar bins per epoch
        times = pd.to_datetime(var_data["epoch"].values)
        solar_bins = self.solar_calc.compute_solar_bins(times, n_bins=n_solar_bins)
        bin_edges = np.linspace(-20, 90, n_solar_bins + 1)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

        n_sid = var_data.sizes.get("sid", 1)

        # Flatten (epoch × sid) → 1-D
        values = var_data.values.ravel()
        cells = cell_ids.values.ravel()
        bins_flat = np.repeat(solar_bins, n_sid)

        valid = np.isfinite(values) & np.isfinite(cells)
        df = pd.DataFrame(
            {
                "solar_bin": bins_flat[valid],
                "value": values[valid],
                "cell_id": cells[valid].astype(int),
            }
        )
        df["weight"] = df["cell_id"].map(
            lambda cid: weights[cid] if cid < len(weights) else 0.0
        )

        grouped = df.groupby("solar_bin")

        means, stds, n_obs = [], [], []
        for bin_idx in range(n_solar_bins):
            if bin_idx in grouped.groups:
                group = grouped.get_group(bin_idx)
                if len(group) >= min_observations:
                    w = group["weight"].values
                    v = group["value"].values
                    if w.sum() > 0:
                        wm = np.average(v, weights=w)
                        ws = np.sqrt(np.average((v - wm) ** 2, weights=w))
                    else:
                        # Zero total weight: statistics are undefined.
                        wm, ws = np.nan, np.nan
                    means.append(wm)
                    stds.append(ws)
                    n_obs.append(len(group))
                else:
                    means.append(np.nan)
                    stds.append(np.nan)
                    n_obs.append(0)
            else:
                means.append(np.nan)
                stds.append(np.nan)
                n_obs.append(0)

        ds = xr.Dataset(
            {
                "mean": ("solar_elevation", np.array(means)),
                "std": ("solar_elevation", np.array(stds)),
                "n_observations": ("solar_elevation", np.array(n_obs)),
            },
            coords={"solar_elevation": bin_centers},
        )
        ds.attrs.update(
            {
                "variable": var_name,
                "grid": self.grid_name,
                "n_solar_bins": n_solar_bins,
                "min_observations": min_observations,
                "coordinate_type": "solar_elevation",
            }
        )
        logger.info(
            "solar-binned diurnal: %d bins, mean n_obs=%.1f",
            n_solar_bins,
            np.nanmean(n_obs),
        )
        return ds

    # ------------------------------------------------------------------
    # Solar metadata
    # ------------------------------------------------------------------

    def add_solar_metadata_to_timeseries(self, timeseries: xr.Dataset) -> xr.Dataset:
        """Attach solar zenith, azimuth and elevation to a time-series.

        Parameters
        ----------
        timeseries : xr.Dataset
            Time-series dataset with an ``epoch`` coordinate.

        Returns
        -------
        xr.Dataset
            Copy with ``solar_zenith``, ``solar_azimuth``,
            ``solar_elevation`` added.

        Raises
        ------
        ValueError
            If no solar calculator is configured.

        """
        if self.solar_calc is None:
            raise ValueError("Solar calculator not initialized")

        epochs = pd.to_datetime(timeseries["epoch"].values)
        zenith, azimuth = self.solar_calc.compute_solar_position(epochs)

        enriched = timeseries.copy()
        # Each entry: variable name → (values, human-readable description).
        solar_vars = {
            "solar_zenith": (zenith, "Solar zenith angle (0° = overhead)"),
            "solar_azimuth": (azimuth, "Solar azimuth angle (0° = North, 90° = East)"),
            "solar_elevation": (
                90 - zenith,
                "Solar elevation angle (0° = horizon, 90° = overhead)",
            ),
        }
        for name, (values, description) in solar_vars.items():
            enriched[name] = ("epoch", values)
            enriched[name].attrs = {"units": "degrees", "description": description}
        return enriched

    # ------------------------------------------------------------------
    # Plotting
    # ------------------------------------------------------------------

    def plot_timeseries(
        self,
        timeseries: xr.Dataset,
        smooth_window: int = 0,
        show_uncertainty: bool = True,
        show_n_cells: bool = False,
        ax: plt.Axes | None = None,
        **style_kwargs: Any,
    ) -> tuple[plt.Figure, plt.Axes]:
        """Plot a time-series with optional Savitzky-Golay smoothing.

        Parameters
        ----------
        timeseries : xr.Dataset
            Output of :meth:`compute_timeseries`.
        smooth_window : int, optional
            Savitzky-Golay window length (0 = off; forced odd internally).
        show_uncertainty : bool, optional
            Draw ±1 std band (skipped when ``std`` is absent).
        show_n_cells : bool, optional
            Secondary y-axis showing cell count.
        ax : plt.Axes or None, optional
            Axes to draw on; created if ``None``.
        **style_kwargs
            ``ylabel``, ``title``, ``figsize`` forwarded to matplotlib.

        Returns
        -------
        fig, ax : plt.Figure, plt.Axes

        """
        if ax is None:
            figsize = style_kwargs.pop("figsize", (12, 6))
            fig, ax = plt.subplots(figsize=figsize)
        else:
            fig = ax.figure

        time = timeseries["epoch"].values
        mean = timeseries["mean"].values

        if smooth_window > 0:
            # Savitzky-Golay requires an odd window length.
            if smooth_window % 2 == 0:
                smooth_window += 1
            valid = np.isfinite(mean)
            if np.sum(valid) > smooth_window:
                mean_smooth = mean.copy()
                mean_smooth[valid] = savgol_filter(
                    mean[valid], smooth_window, polyorder=2
                )
                ax.plot(
                    time,
                    mean,
                    "o",
                    alpha=0.3,
                    label="Raw",
                    markersize=3,
                    color="gray",
                )
                ax.plot(
                    time,
                    mean_smooth,
                    "-",
                    label=f"Smoothed (window={smooth_window})",
                    linewidth=2,
                )
                mean_plot = mean_smooth
            else:
                # Too few finite points to smooth; fall back to raw line.
                ax.plot(time, mean, "o-", label="Mean")
                mean_plot = mean
        else:
            ax.plot(time, mean, "o-", label="Mean", markersize=4)
            mean_plot = mean

        # BUG FIX: ``std`` used to be read unconditionally right after
        # ``mean``, raising KeyError for datasets without it even though
        # this branch already guards on its presence.  Read lazily here.
        if show_uncertainty and "std" in timeseries:
            std = timeseries["std"].values
            ax.fill_between(
                time,
                mean_plot - std,
                mean_plot + std,
                alpha=0.2,
                label="±1 std",
            )

        if show_n_cells and "n_cells" in timeseries:
            ax2 = ax.twinx()
            ax2.plot(
                time,
                timeseries["n_cells"].values,
                "--",
                color="orange",
                alpha=0.5,
                label="N cells",
            )
            ax2.set_ylabel("Number of cells", color="orange")
            ax2.tick_params(axis="y", labelcolor="orange")

        ax.set_xlabel("Time")
        ax.set_ylabel(style_kwargs.get("ylabel", "Value"))
        ax.set_title(style_kwargs.get("title", "Timeseries"))
        ax.legend(loc="best")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        return fig, ax

    def plot_diurnal_cycle(
        self,
        diurnal: xr.Dataset,
        show_confidence: bool = True,
        ax: plt.Axes | None = None,
        **style_kwargs: Any,
    ) -> tuple[plt.Figure, plt.Axes]:
        """Plot the hour-of-day (clock-time) diurnal cycle.

        Parameters
        ----------
        diurnal : xr.Dataset
            Output of :meth:`compute_diurnal_cycle`.
        show_confidence : bool, optional
            Shade a ±1 standard-deviation band around the mean.
        ax : plt.Axes or None, optional
            Axes to draw on; a new figure is created when ``None``.
        **style_kwargs
            ``ylabel``, ``title``, ``figsize``.

        Returns
        -------
        fig, ax : plt.Figure, plt.Axes

        """
        if ax is None:
            fig, ax = plt.subplots(figsize=style_kwargs.pop("figsize", (10, 6)))
        else:
            fig = ax.figure

        hrs = diurnal["hour"].values
        avg = diurnal["mean"].values
        sigma = diurnal["std"].values

        ax.plot(hrs, avg, "o-", linewidth=2, markersize=6, label="Mean")
        if show_confidence:
            ax.fill_between(
                hrs,
                avg - sigma,
                avg + sigma,
                alpha=0.2,
                label="±1 std",
            )

        ax.set_xlabel("Hour of Day")
        ax.set_ylabel(style_kwargs.get("ylabel", "Value"))
        ax.set_title(style_kwargs.get("title", "Diurnal Cycle"))
        ax.set_xlim(0, 24)
        # Tick every three hours across the full day.
        ax.set_xticks(np.arange(0, 25, 3))
        ax.legend(loc="best")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        return fig, ax

    def plot_diurnal_cycle_comparison(
        self,
        diurnal_clock: xr.Dataset,
        diurnal_solar: xr.Dataset,
        figsize: tuple[float, float] = (14, 6),
        **style_kwargs: Any,
    ) -> tuple[plt.Figure, np.ndarray]:
        """Draw clock-time and solar-time diurnal cycles side by side.

        Parameters
        ----------
        diurnal_clock : xr.Dataset
            Output of :meth:`compute_diurnal_cycle`.
        diurnal_solar : xr.Dataset
            Output of :meth:`compute_diurnal_cycle_solar`.
        figsize : tuple, optional
            Figure size.
        **style_kwargs
            ``ylabel``, ``title``.

        Returns
        -------
        fig, axes : plt.Figure, np.ndarray of plt.Axes

        """
        fig, axes = plt.subplots(1, 2, figsize=figsize)
        ylabel = style_kwargs.get("ylabel", "Value")

        # Left panel: statistics binned by hour of day.
        ax_clock = axes[0]
        hrs = diurnal_clock["hour"].values
        avg_c = diurnal_clock["mean"].values
        sig_c = diurnal_clock["std"].values
        ax_clock.plot(hrs, avg_c, "o-", linewidth=2, markersize=6)
        ax_clock.fill_between(hrs, avg_c - sig_c, avg_c + sig_c, alpha=0.2)
        ax_clock.set_xlabel("Hour of Day")
        ax_clock.set_ylabel(ylabel)
        ax_clock.set_title("Clock-Time Diurnal Cycle")
        ax_clock.set_xlim(0, 24)
        ax_clock.set_xticks(np.arange(0, 25, 3))
        ax_clock.grid(True, alpha=0.3)

        # Right panel: statistics binned by solar elevation angle.
        ax_solar = axes[1]
        elev = diurnal_solar["solar_elevation"].values
        avg_s = diurnal_solar["mean"].values
        sig_s = diurnal_solar["std"].values
        ax_solar.plot(
            elev,
            avg_s,
            "o-",
            linewidth=2,
            markersize=6,
            color="orange",
        )
        ax_solar.fill_between(
            elev,
            avg_s - sig_s,
            avg_s + sig_s,
            alpha=0.2,
            color="orange",
        )
        # Mark the horizon (0° elevation) for orientation.
        ax_solar.axvline(0, color="k", linestyle="--", alpha=0.3, label="Horizon")
        ax_solar.set_xlabel("Solar Elevation (°)")
        ax_solar.set_ylabel(ylabel)
        ax_solar.set_title("Solar-Time Diurnal Cycle")
        ax_solar.grid(True, alpha=0.3)
        ax_solar.legend()

        fig.suptitle(
            style_kwargs.get("title", "Diurnal Cycle Comparison"), fontsize=14, y=1.02
        )
        fig.tight_layout()
        return fig, axes

__init__(vod_ds, grid, grid_name, site_lat=None, site_lon=None, site_elevation=0.0)

Initialize the temporal analysis helper.

Parameters

vod_ds : xr.Dataset — VOD dataset containing cell IDs.
grid : GridData — Grid instance.
grid_name : str — Grid name suffix for cell IDs.
site_lat : float | None, optional — Site latitude in degrees.
site_lon : float | None, optional — Site longitude in degrees.
site_elevation : float, default 0.0 — Site elevation in metres.

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def __init__(
    self,
    vod_ds: xr.Dataset,
    grid: GridData,
    grid_name: str,
    site_lat: float | None = None,
    site_lon: float | None = None,
    site_elevation: float = 0.0,
) -> None:
    """Set up the temporal analysis helper.

    Parameters
    ----------
    vod_ds : xr.Dataset
        VOD dataset containing cell IDs.
    grid : GridData
        Grid instance.
    grid_name : str
        Grid name suffix for cell IDs.
    site_lat : float | None, optional
        Site latitude in degrees.
    site_lon : float | None, optional
        Site longitude in degrees.
    site_elevation : float, default 0.0
        Site elevation in metres.

    """
    self.vod_ds = vod_ds
    self.grid = grid
    self.grid_name = grid_name
    self.cell_id_var = f"cell_id_{grid_name}"

    # A solar calculator is only created when both site coordinates
    # are supplied; otherwise solar-aware methods raise at call time.
    self.solar_calc: SolarPositionCalculator | None = None
    if site_lat is not None and site_lon is not None:
        self.solar_calc = SolarPositionCalculator(
            lat=site_lat, lon=site_lon, elevation=site_elevation
        )
        logger.info(
            "solar calculator enabled for (%.4f°, %.4f°)",
            site_lat,
            site_lon,
        )

    # Ensure the expected per-grid cell-ID variable exists.
    if self.cell_id_var not in vod_ds:
        available = [v for v in vod_ds.data_vars if v.startswith("cell_id_")]
        raise ValueError(
            f"Cell ID variable '{self.cell_id_var}' not found in dataset. "
            f"Available: {available}"
        )

compute_timeseries(var_name='VOD', spatial_mask=None, weights=None, aggregate='1D', min_cells=1)

Compute a weighted time-series aggregated over space.

Parameters

var_name : str, optional Data variable to aggregate. spatial_mask : np.ndarray or None, optional Boolean mask of shape (grid.ncells,); True = include. weights : np.ndarray or None, optional Cell weights of shape (grid.ncells,); normalised internally. None → uniform weights. aggregate : str, optional Pandas-compatible frequency string for temporal resampling. min_cells : int, optional Minimum unique cells required per time bin.

Returns

xr.Dataset Variables: mean, std, n_cells, n_observations, sum_weights.

Raises

ValueError If var_name is missing or mask/weight shapes are wrong.

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def compute_timeseries(
    self,
    var_name: str = "VOD",
    spatial_mask: np.ndarray | None = None,
    weights: np.ndarray | None = None,
    aggregate: str = "1D",
    min_cells: int = 1,
) -> xr.Dataset:
    """Compute a weighted time-series aggregated over space.

    Parameters
    ----------
    var_name : str, optional
        Data variable to aggregate.
    spatial_mask : np.ndarray or None, optional
        Boolean mask of shape ``(grid.ncells,)``; ``True`` = include.
    weights : np.ndarray or None, optional
        Cell weights of shape ``(grid.ncells,)``; normalised
        internally.  ``None`` → uniform weights.
    aggregate : str, optional
        Pandas-compatible frequency string for temporal resampling.
    min_cells : int, optional
        Minimum unique cells required per time bin.

    Returns
    -------
    xr.Dataset
        Variables: ``mean``, ``std``, ``n_cells``,
        ``n_observations``, ``sum_weights``.

    Raises
    ------
    ValueError
        If *var_name* is missing or mask/weight shapes are wrong.

    """
    if var_name not in self.vod_ds:
        raise ValueError(f"Variable '{var_name}' not found in dataset")

    logger.info(
        "compute_timeseries: var=%s aggregate=%s mask=%s weights=%s",
        var_name,
        aggregate,
        spatial_mask is not None,
        weights is not None,
    )

    var_data = self.vod_ds[var_name]
    cell_ids = self.vod_ds[self.cell_id_var]

    # Apply spatial mask
    if spatial_mask is not None:
        if spatial_mask.shape != (self.grid.ncells,):
            raise ValueError(
                f"Spatial mask shape {spatial_mask.shape} doesn't match "
                f"grid size ({self.grid.ncells},)"
            )
        # PERF: one vectorised membership test instead of the previous
        # Python loop OR-ing one boolean array per selected cell
        # (O(selected_cells × data) → O(data)).
        selected_cells = np.flatnonzero(spatial_mask)
        var_data = var_data.where(cell_ids.isin(selected_cells))
        logger.debug(
            "spatial mask applied: %d/%d cells",
            spatial_mask.sum(),
            self.grid.ncells,
        )

    # Prepare weights (uniform when None)
    weights = self._prepare_weights(weights)

    return self._compute_weighted_timeseries(
        var_data, cell_ids, weights, aggregate, min_cells
    )

compute_timeseries_solar_corrected(var_name='VOD', spatial_mask=None, weights=None, aggregate='1D', min_cells=1, solar_correction='normalize', reference_zenith=45.0, daytime_only=False, twilight_angle=-6.0)

Compute a time-series after applying a solar correction.

Parameters

var_name : str, optional — Data variable to correct and aggregate.
spatial_mask : np.ndarray or None, optional — Cell selection mask.
weights : np.ndarray or None, optional — Cell weights.
aggregate : str, optional — Temporal resampling frequency.
min_cells : int, optional — Minimum cells per time bin.
solar_correction : {'normalize', 'residual', 'cos_correction'} — Correction method passed to ``SolarPositionCalculator.apply_solar_correction``.
reference_zenith : float, optional — Reference zenith for normalisation (degrees).
daytime_only : bool, optional — If True, mask out nighttime epochs.
twilight_angle : float, optional — Solar-elevation threshold for daytime (degrees).

Returns

xr.Dataset Solar-corrected time-series with additional metadata attrs.

Raises

ValueError If no solar calculator is configured.

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
def compute_timeseries_solar_corrected(
    self,
    var_name: str = "VOD",
    spatial_mask: np.ndarray | None = None,
    weights: np.ndarray | None = None,
    aggregate: str = "1D",
    min_cells: int = 1,
    solar_correction: Literal[
        "normalize", "residual", "cos_correction"
    ] = "normalize",
    reference_zenith: float = 45.0,
    daytime_only: bool = False,
    twilight_angle: float = -6.0,
) -> xr.Dataset:
    """Compute a time-series after applying a solar correction.

    Parameters
    ----------
    var_name : str, optional
        Data variable to correct and aggregate.
    spatial_mask : np.ndarray or None, optional
        Cell selection mask.
    weights : np.ndarray or None, optional
        Cell weights.
    aggregate : str, optional
        Temporal resampling frequency.
    min_cells : int, optional
        Minimum cells per time bin.
    solar_correction : {'normalize', 'residual', 'cos_correction'}
        Correction method passed to
        :meth:`SolarPositionCalculator.apply_solar_correction`.
    reference_zenith : float, optional
        Reference zenith for normalisation (degrees).
    daytime_only : bool, optional
        If ``True``, mask out nighttime epochs.
    twilight_angle : float, optional
        Solar-elevation threshold for daytime (degrees).

    Returns
    -------
    xr.Dataset
        Solar-corrected time-series with additional metadata attrs.

    Raises
    ------
    ValueError
        If no solar calculator is configured or *var_name* is missing.

    """
    if self.solar_calc is None:
        raise ValueError(
            "Solar calculator not initialized. "
            "Provide site_lat and site_lon to TemporalAnalysis constructor."
        )

    # CONSISTENCY FIX: validate var_name up front like compute_timeseries
    # does, instead of failing later with a bare KeyError.
    if var_name not in self.vod_ds:
        raise ValueError(f"Variable '{var_name}' not found in dataset")

    logger.info(
        "solar-corrected timeseries: correction=%s daytime_only=%s",
        solar_correction,
        daytime_only,
    )

    var_data = self.vod_ds[var_name]

    # Apply solar correction
    var_data_corrected = self.solar_calc.apply_solar_correction(
        var_data, method=solar_correction, reference_zenith=reference_zenith
    )

    # Daytime filter: drop epochs whose solar elevation is below the
    # twilight threshold.
    if daytime_only:
        times = pd.to_datetime(var_data["epoch"].values)
        is_day = self.solar_calc.is_daytime(times, twilight_angle)
        is_day_da = xr.DataArray(
            is_day, coords={"epoch": var_data_corrected["epoch"]}, dims=["epoch"]
        )
        var_data_corrected = var_data_corrected.where(is_day_da)
        logger.debug(
            "daytime filter: %d/%d timesteps kept",
            is_day.sum(),
            len(is_day),
        )

    # Temporary dataset holding the corrected variable alongside the
    # original cell-ID variables.
    corrected_name = f"{var_name}_solar_corrected"
    ds_temp = self.vod_ds.copy()
    ds_temp[corrected_name] = var_data_corrected

    # Reuse compute_timeseries via a lightweight temporary instance
    # (bypasses __init__ to avoid re-validating/re-building state).
    analysis_temp = TemporalAnalysis.__new__(TemporalAnalysis)
    analysis_temp.vod_ds = ds_temp
    analysis_temp.grid = self.grid
    analysis_temp.grid_name = self.grid_name
    analysis_temp.cell_id_var = self.cell_id_var
    analysis_temp.solar_calc = self.solar_calc

    ts = analysis_temp.compute_timeseries(
        var_name=corrected_name,
        spatial_mask=spatial_mask,
        weights=weights,
        aggregate=aggregate,
        min_cells=min_cells,
    )

    # Record how the correction was configured.
    ts.attrs["solar_correction"] = solar_correction
    ts.attrs["reference_zenith"] = reference_zenith
    ts.attrs["daytime_only"] = daytime_only
    if daytime_only:
        ts.attrs["twilight_angle"] = twilight_angle

    return ts

compute_diurnal_cycle(var_name='VOD', spatial_mask=None, weights=None, hour_bins=24, min_observations=10)

Compute clock-time diurnal cycle (hour-of-day statistics).

Parameters

var_name : str, optional Data variable to bin. spatial_mask : np.ndarray or None, optional Cell selection mask. weights : np.ndarray or None, optional Cell weights. hour_bins : int, optional Number of equal-width hour bins over [0, 24). min_observations : int, optional Minimum observations required per bin.

Returns

xr.Dataset mean, std, n_observations on the hour coordinate.

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
def compute_diurnal_cycle(
    self,
    var_name: str = "VOD",
    spatial_mask: np.ndarray | None = None,
    weights: np.ndarray | None = None,
    hour_bins: int = 24,
    min_observations: int = 10,
) -> xr.Dataset:
    """Compute clock-time diurnal cycle (hour-of-day statistics).

    Parameters
    ----------
    var_name : str, optional
        Data variable to bin.
    spatial_mask : np.ndarray or None, optional
        Cell selection mask of shape ``(grid.ncells,)``.
    weights : np.ndarray or None, optional
        Cell weights.
    hour_bins : int, optional
        Number of equal-width hour bins over [0, 24).
    min_observations : int, optional
        Minimum observations required per bin.

    Returns
    -------
    xr.Dataset
        ``mean``, ``std``, ``n_observations`` on the ``hour``
        coordinate.

    Raises
    ------
    ValueError
        If *var_name* is missing or the mask shape is wrong.

    """
    if var_name not in self.vod_ds:
        raise ValueError(f"Variable '{var_name}' not found in dataset")

    logger.info("compute_diurnal_cycle: var=%s hour_bins=%d", var_name, hour_bins)

    var_data = self.vod_ds[var_name]
    cell_ids = self.vod_ds[self.cell_id_var]

    # Spatial mask.  CONSISTENCY FIX: validate the shape (as
    # compute_timeseries does) and use a single vectorised isin() call
    # instead of a per-cell Python loop.
    if spatial_mask is not None:
        if spatial_mask.shape != (self.grid.ncells,):
            raise ValueError(
                f"Spatial mask shape {spatial_mask.shape} doesn't match "
                f"grid size ({self.grid.ncells},)"
            )
        var_data = var_data.where(cell_ids.isin(np.flatnonzero(spatial_mask)))

    weights = self._prepare_weights(weights)

    # Fractional hour of day per epoch
    times = pd.to_datetime(var_data["epoch"].values)
    hours = times.hour + times.minute / 60.0

    n_sid = var_data.sizes.get("sid", 1)
    hour_edges = np.linspace(0, 24, hour_bins + 1)
    hour_centers = (hour_edges[:-1] + hour_edges[1:]) / 2

    # Flatten (epoch, sid) into parallel 1-D vectors
    values = var_data.values.ravel()
    cells = cell_ids.values.ravel()
    hours_flat = np.repeat(hours, n_sid)

    valid = np.isfinite(values) & np.isfinite(cells)
    df = pd.DataFrame(
        {
            "hour": hours_flat[valid],
            "value": values[valid],
            "cell_id": cells[valid].astype(int),
        }
    )
    # Out-of-range cell IDs get zero weight rather than raising.
    df["weight"] = df["cell_id"].map(
        lambda cid: weights[cid] if cid < len(weights) else 0.0
    )
    df["hour_bin"] = pd.cut(
        df["hour"], bins=hour_edges, labels=hour_centers, include_lowest=True
    )

    grouped = df.groupby("hour_bin")

    means, stds, n_obs = [], [], []
    for hc in hour_centers:
        group = grouped.get_group(hc) if hc in grouped.groups else None
        if group is None or len(group) < min_observations:
            # Empty or under-populated bin → NaN statistics.
            means.append(np.nan)
            stds.append(np.nan)
            n_obs.append(0)
            continue
        w = group["weight"].values
        v = group["value"].values
        if w.sum() > 0:
            wm = np.average(v, weights=w)
            ws = np.sqrt(np.average((v - wm) ** 2, weights=w))
        else:
            wm, ws = np.nan, np.nan
        means.append(wm)
        stds.append(ws)
        n_obs.append(len(group))

    ds = xr.Dataset(
        {
            "mean": ("hour", np.array(means)),
            "std": ("hour", np.array(stds)),
            "n_observations": ("hour", np.array(n_obs)),
        },
        coords={"hour": hour_centers},
    )
    ds.attrs.update(
        {
            "variable": var_name,
            "grid": self.grid_name,
            "hour_bins": hour_bins,
            "min_observations": min_observations,
        }
    )
    logger.info(
        "diurnal cycle: %d bins, mean n_obs=%.1f",
        hour_bins,
        np.nanmean(n_obs),
    )
    return ds

compute_diurnal_cycle_solar(var_name='VOD', spatial_mask=None, weights=None, n_solar_bins=12, min_observations=10)

Diurnal cycle binned by solar elevation instead of clock time.

Accounts for seasonal variation in solar position, producing a more physically meaningful diurnal pattern.

Parameters

var_name : str, optional Data variable to bin. spatial_mask : np.ndarray or None, optional Cell selection mask. weights : np.ndarray or None, optional Cell weights. n_solar_bins : int, optional Number of equal-width bins over [-20°, 90°]. min_observations : int, optional Minimum observations per bin.

Returns

xr.Dataset mean, std, n_observations on the solar_elevation coordinate.

Raises

ValueError If no solar calculator is configured.

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
def compute_diurnal_cycle_solar(
    self,
    var_name: str = "VOD",
    spatial_mask: np.ndarray | None = None,
    weights: np.ndarray | None = None,
    n_solar_bins: int = 12,
    min_observations: int = 10,
) -> xr.Dataset:
    """Diurnal cycle binned by solar elevation instead of clock time.

    Accounts for seasonal variation in solar position, producing a
    more physically meaningful diurnal pattern.

    Parameters
    ----------
    var_name : str, optional
        Data variable to bin.
    spatial_mask : np.ndarray or None, optional
        Cell selection mask of shape ``(grid.ncells,)``.
    weights : np.ndarray or None, optional
        Cell weights.
    n_solar_bins : int, optional
        Number of equal-width bins over [-20°, 90°].
    min_observations : int, optional
        Minimum observations per bin.

    Returns
    -------
    xr.Dataset
        ``mean``, ``std``, ``n_observations`` on the
        ``solar_elevation`` coordinate.

    Raises
    ------
    ValueError
        If no solar calculator is configured or the mask shape is
        wrong.

    """
    if self.solar_calc is None:
        raise ValueError(
            "Solar calculator not initialized. "
            "Provide site_lat and site_lon to TemporalAnalysis constructor."
        )

    logger.info("solar-binned diurnal cycle: n_bins=%d", n_solar_bins)

    var_data = self.vod_ds[var_name]
    cell_ids = self.vod_ds[self.cell_id_var]

    # Spatial mask.  CONSISTENCY FIX: validate the shape (as
    # compute_timeseries does) and use a single vectorised isin() call
    # instead of a per-cell Python loop.
    if spatial_mask is not None:
        if spatial_mask.shape != (self.grid.ncells,):
            raise ValueError(
                f"Spatial mask shape {spatial_mask.shape} doesn't match "
                f"grid size ({self.grid.ncells},)"
            )
        var_data = var_data.where(cell_ids.isin(np.flatnonzero(spatial_mask)))

    weights = self._prepare_weights(weights)

    # Solar-elevation bin index per epoch; bin edges span [-20°, 90°].
    times = pd.to_datetime(var_data["epoch"].values)
    solar_bins = self.solar_calc.compute_solar_bins(times, n_bins=n_solar_bins)
    bin_edges = np.linspace(-20, 90, n_solar_bins + 1)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    n_sid = var_data.sizes.get("sid", 1)

    # Flatten (epoch, sid) into parallel 1-D vectors
    values = var_data.values.ravel()
    cells = cell_ids.values.ravel()
    bins_flat = np.repeat(solar_bins, n_sid)

    valid = np.isfinite(values) & np.isfinite(cells)
    df = pd.DataFrame(
        {
            "solar_bin": bins_flat[valid],
            "value": values[valid],
            "cell_id": cells[valid].astype(int),
        }
    )
    # Out-of-range cell IDs get zero weight rather than raising.
    df["weight"] = df["cell_id"].map(
        lambda cid: weights[cid] if cid < len(weights) else 0.0
    )

    grouped = df.groupby("solar_bin")

    means, stds, n_obs = [], [], []
    for bin_idx in range(n_solar_bins):
        group = grouped.get_group(bin_idx) if bin_idx in grouped.groups else None
        if group is None or len(group) < min_observations:
            # Empty or under-populated bin → NaN statistics.
            means.append(np.nan)
            stds.append(np.nan)
            n_obs.append(0)
            continue
        w = group["weight"].values
        v = group["value"].values
        if w.sum() > 0:
            wm = np.average(v, weights=w)
            ws = np.sqrt(np.average((v - wm) ** 2, weights=w))
        else:
            wm, ws = np.nan, np.nan
        means.append(wm)
        stds.append(ws)
        n_obs.append(len(group))

    ds = xr.Dataset(
        {
            "mean": ("solar_elevation", np.array(means)),
            "std": ("solar_elevation", np.array(stds)),
            "n_observations": ("solar_elevation", np.array(n_obs)),
        },
        coords={"solar_elevation": bin_centers},
    )
    ds.attrs.update(
        {
            "variable": var_name,
            "grid": self.grid_name,
            "n_solar_bins": n_solar_bins,
            "min_observations": min_observations,
            "coordinate_type": "solar_elevation",
        }
    )
    logger.info(
        "solar-binned diurnal: %d bins, mean n_obs=%.1f",
        n_solar_bins,
        np.nanmean(n_obs),
    )
    return ds

add_solar_metadata_to_timeseries(timeseries)

Attach solar zenith, azimuth and elevation to a time-series.

Parameters

timeseries : xr.Dataset Time-series dataset with an epoch coordinate.

Returns

xr.Dataset Copy with solar_zenith, solar_azimuth, solar_elevation added.

Raises

ValueError If no solar calculator is configured.

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
def add_solar_metadata_to_timeseries(self, timeseries: xr.Dataset) -> xr.Dataset:
    """Attach solar zenith, azimuth and elevation to a time-series.

    Parameters
    ----------
    timeseries : xr.Dataset
        Time-series dataset with an ``epoch`` coordinate.

    Returns
    -------
    xr.Dataset
        Copy with ``solar_zenith``, ``solar_azimuth``,
        ``solar_elevation`` added.

    Raises
    ------
    ValueError
        If no solar calculator is configured.

    """
    if self.solar_calc is None:
        raise ValueError("Solar calculator not initialized")

    epochs = pd.to_datetime(timeseries["epoch"].values)
    zenith, azimuth = self.solar_calc.compute_solar_position(epochs)

    result = timeseries.copy()
    result["solar_zenith"] = ("epoch", zenith)
    result["solar_azimuth"] = ("epoch", azimuth)
    # Elevation is the complement of zenith.
    result["solar_elevation"] = ("epoch", 90 - zenith)

    descriptions = {
        "solar_zenith": "Solar zenith angle (0° = overhead)",
        "solar_azimuth": "Solar azimuth angle (0° = North, 90° = East)",
        "solar_elevation": "Solar elevation angle (0° = horizon, 90° = overhead)",
    }
    for var, description in descriptions.items():
        result[var].attrs = {
            "units": "degrees",
            "description": description,
        }
    return result

plot_timeseries(timeseries, smooth_window=0, show_uncertainty=True, show_n_cells=False, ax=None, **style_kwargs)

Plot a time-series with optional Savitzky-Golay smoothing.

Parameters

timeseries : xr.Dataset Output of :meth:compute_timeseries. smooth_window : int, optional Savitzky-Golay window length (0 = off; forced odd internally). show_uncertainty : bool, optional Draw ±1 std band. show_n_cells : bool, optional Secondary y-axis showing cell count. ax : plt.Axes or None, optional Axes to draw on; created if None. **style_kwargs ylabel, title, figsize forwarded to matplotlib.

Returns

fig, ax : plt.Figure, plt.Axes

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
def plot_timeseries(
    self,
    timeseries: xr.Dataset,
    smooth_window: int = 0,
    show_uncertainty: bool = True,
    show_n_cells: bool = False,
    ax: plt.Axes | None = None,
    **style_kwargs: Any,
) -> tuple[plt.Figure, plt.Axes]:
    """Plot a time-series with optional Savitzky-Golay smoothing.

    Parameters
    ----------
    timeseries : xr.Dataset
        Output of :meth:`compute_timeseries`.
    smooth_window : int, optional
        Savitzky-Golay window length (0 = off; forced odd internally).
    show_uncertainty : bool, optional
        Draw ±1 std band (requires a ``std`` variable in *timeseries*).
    show_n_cells : bool, optional
        Secondary y-axis showing cell count.
    ax : plt.Axes or None, optional
        Axes to draw on; created if ``None``.
    **style_kwargs
        ``ylabel``, ``title``, ``figsize`` forwarded to matplotlib.

    Returns
    -------
    fig, ax : plt.Figure, plt.Axes

    """
    if ax is None:
        figsize = style_kwargs.pop("figsize", (12, 6))
        fig, ax = plt.subplots(figsize=figsize)
    else:
        fig = ax.figure

    time = timeseries["epoch"].values
    mean = timeseries["mean"].values

    if smooth_window > 0:
        # savgol_filter requires an odd window length.
        if smooth_window % 2 == 0:
            smooth_window += 1
        valid = np.isfinite(mean)
        # Smoothing only makes sense with more valid samples than the window.
        if np.sum(valid) > smooth_window:
            mean_smooth = mean.copy()
            mean_smooth[valid] = savgol_filter(
                mean[valid], smooth_window, polyorder=2
            )
            ax.plot(
                time,
                mean,
                "o",
                alpha=0.3,
                label="Raw",
                markersize=3,
                color="gray",
            )
            ax.plot(
                time,
                mean_smooth,
                "-",
                label=f"Smoothed (window={smooth_window})",
                linewidth=2,
            )
            mean_plot = mean_smooth
        else:
            ax.plot(time, mean, "o-", label="Mean")
            mean_plot = mean
    else:
        ax.plot(time, mean, "o-", label="Mean", markersize=4)
        mean_plot = mean

    if show_uncertainty and "std" in timeseries:
        # Bug fix: read "std" only after the membership check.  Previously
        # it was accessed unconditionally above, so a dataset without a
        # "std" variable raised KeyError and this guard was unreachable.
        std = timeseries["std"].values
        ax.fill_between(
            time,
            mean_plot - std,
            mean_plot + std,
            alpha=0.2,
            label="±1 std",
        )

    if show_n_cells and "n_cells" in timeseries:
        ax2 = ax.twinx()
        ax2.plot(
            time,
            timeseries["n_cells"].values,
            "--",
            color="orange",
            alpha=0.5,
            label="N cells",
        )
        ax2.set_ylabel("Number of cells", color="orange")
        ax2.tick_params(axis="y", labelcolor="orange")

    ax.set_xlabel("Time")
    ax.set_ylabel(style_kwargs.get("ylabel", "Value"))
    ax.set_title(style_kwargs.get("title", "Timeseries"))
    ax.legend(loc="best")
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    return fig, ax

plot_diurnal_cycle(diurnal, show_confidence=True, ax=None, **style_kwargs)

Plot a clock-time diurnal cycle.

Parameters

diurnal : xr.Dataset Output of :meth:compute_diurnal_cycle. show_confidence : bool, optional Draw ±1 std band. ax : plt.Axes or None, optional Axes to draw on; created if None. **style_kwargs ylabel, title, figsize.

Returns

fig, ax : plt.Figure, plt.Axes

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
def plot_diurnal_cycle(
    self,
    diurnal: xr.Dataset,
    show_confidence: bool = True,
    ax: plt.Axes | None = None,
    **style_kwargs: Any,
) -> tuple[plt.Figure, plt.Axes]:
    """Draw a clock-time diurnal cycle on a new or supplied axes.

    Parameters
    ----------
    diurnal : xr.Dataset
        Output of :meth:`compute_diurnal_cycle`.
    show_confidence : bool, optional
        Shade the ±1 standard-deviation envelope.
    ax : plt.Axes or None, optional
        Target axes; a fresh figure is created when ``None``.
    **style_kwargs
        ``ylabel``, ``title``, ``figsize``.

    Returns
    -------
    fig, ax : plt.Figure, plt.Axes

    """
    if ax is not None:
        fig = ax.figure
    else:
        fig, ax = plt.subplots(figsize=style_kwargs.pop("figsize", (10, 6)))

    hour_vals = diurnal["hour"].values
    cycle_mean = diurnal["mean"].values
    cycle_std = diurnal["std"].values

    ax.plot(hour_vals, cycle_mean, "o-", linewidth=2, markersize=6, label="Mean")
    if show_confidence:
        lower = cycle_mean - cycle_std
        upper = cycle_mean + cycle_std
        ax.fill_between(hour_vals, lower, upper, alpha=0.2, label="±1 std")

    ax.set_xlabel("Hour of Day")
    ax.set_ylabel(style_kwargs.get("ylabel", "Value"))
    ax.set_title(style_kwargs.get("title", "Diurnal Cycle"))
    ax.set_xlim(0, 24)
    ax.set_xticks(np.arange(0, 25, 3))
    ax.legend(loc="best")
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    return fig, ax

plot_diurnal_cycle_comparison(diurnal_clock, diurnal_solar, figsize=(14, 6), **style_kwargs)

Side-by-side clock-time vs solar-time diurnal cycle plots.

Parameters

diurnal_clock : xr.Dataset Output of :meth:compute_diurnal_cycle. diurnal_solar : xr.Dataset Output of :meth:compute_diurnal_cycle_solar. figsize : tuple, optional Figure size. **style_kwargs ylabel, title.

Returns

fig, axes : plt.Figure, np.ndarray of plt.Axes

Source code in packages/canvod-grids/src/canvod/grids/analysis/temporal.py
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
def plot_diurnal_cycle_comparison(
    self,
    diurnal_clock: xr.Dataset,
    diurnal_solar: xr.Dataset,
    figsize: tuple[float, float] = (14, 6),
    **style_kwargs: Any,
) -> tuple[plt.Figure, np.ndarray]:
    """Render clock-time and solar-time diurnal cycles side by side.

    Parameters
    ----------
    diurnal_clock : xr.Dataset
        Output of :meth:`compute_diurnal_cycle`.
    diurnal_solar : xr.Dataset
        Output of :meth:`compute_diurnal_cycle_solar`.
    figsize : tuple, optional
        Figure size.
    **style_kwargs
        ``ylabel``, ``title``.

    Returns
    -------
    fig, axes : plt.Figure, np.ndarray of plt.Axes

    """
    fig, axes = plt.subplots(1, 2, figsize=figsize)
    y_label = style_kwargs.get("ylabel", "Value")

    # Left panel: cycle binned by local clock hour.
    clock_ax = axes[0]
    hour_axis = diurnal_clock["hour"].values
    clock_mean = diurnal_clock["mean"].values
    clock_std = diurnal_clock["std"].values

    clock_ax.plot(hour_axis, clock_mean, "o-", linewidth=2, markersize=6)
    clock_ax.fill_between(
        hour_axis, clock_mean - clock_std, clock_mean + clock_std, alpha=0.2
    )
    clock_ax.set_xlabel("Hour of Day")
    clock_ax.set_ylabel(y_label)
    clock_ax.set_title("Clock-Time Diurnal Cycle")
    clock_ax.set_xlim(0, 24)
    clock_ax.set_xticks(np.arange(0, 25, 3))
    clock_ax.grid(True, alpha=0.3)

    # Right panel: cycle binned by solar elevation angle.
    solar_ax = axes[1]
    elev_axis = diurnal_solar["solar_elevation"].values
    solar_mean = diurnal_solar["mean"].values
    solar_std = diurnal_solar["std"].values

    solar_ax.plot(
        elev_axis,
        solar_mean,
        "o-",
        linewidth=2,
        markersize=6,
        color="orange",
    )
    solar_ax.fill_between(
        elev_axis,
        solar_mean - solar_std,
        solar_mean + solar_std,
        alpha=0.2,
        color="orange",
    )
    solar_ax.axvline(0, color="k", linestyle="--", alpha=0.3, label="Horizon")
    solar_ax.set_xlabel("Solar Elevation (°)")
    solar_ax.set_ylabel(y_label)
    solar_ax.set_title("Solar-Time Diurnal Cycle")
    solar_ax.grid(True, alpha=0.3)
    solar_ax.legend()

    fig.suptitle(
        style_kwargs.get("title", "Diurnal Cycle Comparison"), fontsize=14, y=1.02
    )
    fig.tight_layout()
    return fig, axes

Spatial Analysis

Spatial analysis of gridded VOD data.

Per-cell statistical aggregation and basic comparative plotting for VOD datasets with pre-assigned cell IDs.

Classes

VODSpatialAnalyzer Computes per-cell temporal statistics and provides simple histogram-based comparisons between filtering variants.

Notes

  • Spatial visualisation (hemisphere maps, 3-D projections) is handled by the canvod-viz package. This module is limited to the statistical aggregation and lightweight comparison plots that do not require a hemisphere renderer.
  • compute_spatial_statistics returns both a grid-aligned array (length grid.ncells, NaN for empty cells) and a compact patch-aligned array containing only cells with observations. Use the grid-aligned form for masking / weighting; use the patch-aligned form when passing data to canvod-viz renderers.

VODSpatialAnalyzer

Per-cell spatial analysis of a VOD dataset.

Parameters

vod_data : xr.Dataset Dataset with a cell_id_<grid_name> variable already assigned. grid : GridData Grid instance (must expose .ncells). grid_name : str, optional Suffix for the cell-ID variable.

Raises

ValueError If the expected cell-ID variable is missing.

Source code in packages/canvod-grids/src/canvod/grids/analysis/spatial.py
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
class VODSpatialAnalyzer:
    """Per-cell spatial analysis of a VOD dataset.

    Parameters
    ----------
    vod_data : xr.Dataset
        Dataset with a ``cell_id_<grid_name>`` variable already
        assigned.
    grid : GridData
        Grid instance (must expose ``.ncells``).
    grid_name : str, optional
        Suffix for the cell-ID variable.

    Raises
    ------
    ValueError
        If the expected cell-ID variable is missing.

    """

    def __init__(
        self,
        vod_data: xr.Dataset,
        grid: GridData,
        grid_name: str = "equal_area_2deg",
    ) -> None:
        """Initialize the spatial analyzer.

        Parameters
        ----------
        vod_data : xr.Dataset
            Dataset with cell IDs.
        grid : GridData
            Grid instance.
        grid_name : str, default "equal_area_2deg"
            Grid name suffix for cell IDs.

        """
        self.vod_data = vod_data
        self.grid = grid
        self.grid_name = grid_name
        self.cell_id_var = f"cell_id_{grid_name}"

        if self.cell_id_var not in vod_data:
            available = [v for v in vod_data.data_vars if v.startswith("cell_id_")]
            raise ValueError(
                f"Cell ID variable '{self.cell_id_var}' not found. "
                f"Available: {available}"
            )

        logger.info(
            "VODSpatialAnalyzer: grid=%s ncells=%d shape=%s",
            grid_name,
            grid.ncells,
            dict(vod_data.sizes),
        )

    def compute_spatial_statistics(
        self,
        var_name: str = "VOD",
        time_agg: str = "mean",
    ) -> dict:
        """Compute per-cell temporal statistics.

        Parameters
        ----------
        var_name : str, optional
            Data variable to aggregate over time.
        time_agg : {'mean', 'std', 'count', 'median'}
            Aggregation function applied per cell.

        Returns
        -------
        dict
            Keys:

            ``grid_aligned`` : np.ndarray
                Shape ``(grid.ncells,)``.  NaN for cells without data.
                Use for masking, weighting, or any grid-indexed operation.
            ``patch_aligned`` : np.ndarray
                Compact array containing only cells with observations.
                Use when passing data to ``canvod-viz`` renderers.
            ``cell_ids_with_data`` : np.ndarray
                Integer cell IDs corresponding to ``patch_aligned``.
            ``metadata`` : dict
                ``valid_cells``, ``total_cells``, ``coverage_percent``,
                ``variable``, ``aggregation``.

        Raises
        ------
        ValueError
            If *var_name* is not in the dataset or *time_agg* is
            unrecognised.

        """
        if var_name not in self.vod_data:
            raise ValueError(
                f"Variable '{var_name}' not found. "
                f"Available: {list(self.vod_data.data_vars)}"
            )

        _AGG_FUNCS = {"mean", "std", "count", "median"}
        if time_agg not in _AGG_FUNCS:
            raise ValueError(
                f"Unknown aggregation '{time_agg}'; expected one of {_AGG_FUNCS}"
            )

        logger.info("spatial statistics: var=%s agg=%s", var_name, time_agg)

        var_data = self.vod_data[var_name]
        cell_ids = self.vod_data[self.cell_id_var]

        df = pd.DataFrame(
            {
                "cell_id": cell_ids.values.ravel(),
                "value": var_data.values.ravel(),
            }
        )
        df = df.dropna()

        if len(df) == 0:
            logger.warning("no valid data after removing NaN values")
            return {
                "grid_aligned": np.full(self.grid.ncells, np.nan),
                "patch_aligned": np.array([]),
                "cell_ids_with_data": np.array([], dtype=int),
                "metadata": {
                    "valid_cells": 0,
                    "total_cells": self.grid.ncells,
                    "coverage_percent": 0.0,
                    "variable": var_name,
                    "aggregation": time_agg,
                },
            }

        logger.debug(
            "%d valid observations across %d cells",
            len(df),
            df["cell_id"].nunique(),
        )

        cell_stats = getattr(df.groupby("cell_id")["value"], time_agg)()

        # Vectorized assembly (replaces a per-cell Python loop).  groupby
        # output is sorted by cell_id, so the filtered arrays keep the same
        # ascending order the loop produced; fancy-index assignment with
        # duplicate ids also keeps last-wins semantics.
        stat_ids = cell_stats.index.to_numpy().astype(int)
        stat_vals = cell_stats.to_numpy()  # native dtype (int for 'count')
        in_range = (stat_ids >= 0) & (stat_ids < self.grid.ncells)
        cell_ids_with_data = stat_ids[in_range]
        patch_aligned = stat_vals[in_range]

        grid_aligned = np.full(self.grid.ncells, np.nan)
        grid_aligned[cell_ids_with_data] = patch_aligned

        n_valid = len(patch_aligned)
        logger.info("spatial stats: %d/%d cells with data", n_valid, self.grid.ncells)

        return {
            "grid_aligned": grid_aligned,
            "patch_aligned": patch_aligned,
            "cell_ids_with_data": cell_ids_with_data,
            "metadata": {
                "valid_cells": n_valid,
                "total_cells": self.grid.ncells,
                "coverage_percent": (n_valid / self.grid.ncells) * 100,
                "variable": var_name,
                "aggregation": time_agg,
            },
        }

    def compare_filtering_methods(
        self,
        original_var: str = "VOD",
        filtered_var: str = "VOD_filtered",
        figsize: tuple = (16, 6),
        ax: tuple | None = None,
    ) -> tuple[plt.Figure, np.ndarray]:
        """Histogram comparison of original vs filtered VOD distributions.

        Parameters
        ----------
        original_var : str, optional
            Unfiltered variable name.
        filtered_var : str, optional
            Filtered variable name.
        figsize : tuple, optional
            Figure size when *ax* is ``None``.
        ax : tuple of two plt.Axes or None, optional
            Pre-existing axes pair.  Created if ``None``.

        Returns
        -------
        fig, axes : plt.Figure, np.ndarray of plt.Axes

        """
        if ax is None:
            fig, axes = plt.subplots(1, 2, figsize=figsize)
        else:
            axes = ax
            fig = axes[0].figure

        stats_orig = self.compute_spatial_statistics(original_var, "mean")
        stats_filt = self.compute_spatial_statistics(filtered_var, "mean")

        data_orig = stats_orig["grid_aligned"]
        data_filt = stats_filt["grid_aligned"]

        axes[0].hist(
            data_orig[np.isfinite(data_orig)], bins=50, alpha=0.7, label="Original"
        )
        axes[0].set_title("Original VOD Distribution")
        axes[0].set_xlabel("VOD")
        axes[0].set_ylabel("Frequency")

        axes[1].hist(
            data_filt[np.isfinite(data_filt)],
            bins=50,
            alpha=0.7,
            label="Filtered",
            color="green",
        )
        axes[1].set_title("Filtered VOD Distribution")
        axes[1].set_xlabel("VOD")
        axes[1].set_ylabel("Frequency")

        fig.tight_layout()
        return fig, axes

__init__(vod_data, grid, grid_name='equal_area_2deg')

Initialize the spatial analyzer.

Parameters

vod_data : xr.Dataset Dataset with cell IDs. grid : GridData Grid instance. grid_name : str, default "equal_area_2deg" Grid name suffix for cell IDs.

Source code in packages/canvod-grids/src/canvod/grids/analysis/spatial.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def __init__(
    self,
    vod_data: xr.Dataset,
    grid: GridData,
    grid_name: str = "equal_area_2deg",
) -> None:
    """Set up the analyzer and verify the cell-ID variable exists.

    Parameters
    ----------
    vod_data : xr.Dataset
        Dataset with cell IDs.
    grid : GridData
        Grid instance.
    grid_name : str, default "equal_area_2deg"
        Grid name suffix for cell IDs.

    """
    self.grid = grid
    self.grid_name = grid_name
    self.vod_data = vod_data
    self.cell_id_var = f"cell_id_{grid_name}"

    if self.cell_id_var not in vod_data:
        # Help the caller by listing every cell-ID variable that IS present.
        available = [
            name for name in vod_data.data_vars if name.startswith("cell_id_")
        ]
        raise ValueError(
            f"Cell ID variable '{self.cell_id_var}' not found. "
            f"Available: {available}"
        )

    logger.info(
        "VODSpatialAnalyzer: grid=%s ncells=%d shape=%s",
        grid_name,
        grid.ncells,
        dict(vod_data.sizes),
    )

compute_spatial_statistics(var_name='VOD', time_agg='mean')

Compute per-cell temporal statistics.

Parameters

var_name : str, optional Data variable to aggregate over time. time_agg : {'mean', 'std', 'count', 'median'} Aggregation function applied per cell.

Returns

dict Keys:

``grid_aligned`` : np.ndarray
    Shape ``(grid.ncells,)``.  NaN for cells without data.
    Use for masking, weighting, or any grid-indexed operation.
``patch_aligned`` : np.ndarray
    Compact array containing only cells with observations.
    Use when passing data to ``canvod-viz`` renderers.
``cell_ids_with_data`` : np.ndarray
    Integer cell IDs corresponding to ``patch_aligned``.
``metadata`` : dict
    ``valid_cells``, ``total_cells``, ``coverage_percent``,
    ``variable``, ``aggregation``.

Raises

ValueError If var_name is not in the dataset or time_agg is unrecognised.

Source code in packages/canvod-grids/src/canvod/grids/analysis/spatial.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def compute_spatial_statistics(
    self,
    var_name: str = "VOD",
    time_agg: str = "mean",
) -> dict:
    """Compute per-cell temporal statistics.

    Parameters
    ----------
    var_name : str, optional
        Data variable to aggregate over time.
    time_agg : {'mean', 'std', 'count', 'median'}
        Aggregation function applied per cell.

    Returns
    -------
    dict
        Keys:

        ``grid_aligned`` : np.ndarray
            Shape ``(grid.ncells,)``.  NaN for cells without data.
            Use for masking, weighting, or any grid-indexed operation.
        ``patch_aligned`` : np.ndarray
            Compact array containing only cells with observations.
            Use when passing data to ``canvod-viz`` renderers.
        ``cell_ids_with_data`` : np.ndarray
            Integer cell IDs corresponding to ``patch_aligned``.
        ``metadata`` : dict
            ``valid_cells``, ``total_cells``, ``coverage_percent``,
            ``variable``, ``aggregation``.

    Raises
    ------
    ValueError
        If *var_name* is not in the dataset or *time_agg* is
        unrecognised.

    """
    if var_name not in self.vod_data:
        raise ValueError(
            f"Variable '{var_name}' not found. "
            f"Available: {list(self.vod_data.data_vars)}"
        )

    _AGG_FUNCS = {"mean", "std", "count", "median"}
    if time_agg not in _AGG_FUNCS:
        raise ValueError(
            f"Unknown aggregation '{time_agg}'; expected one of {_AGG_FUNCS}"
        )

    logger.info("spatial statistics: var=%s agg=%s", var_name, time_agg)

    var_data = self.vod_data[var_name]
    cell_ids = self.vod_data[self.cell_id_var]

    df = pd.DataFrame(
        {
            "cell_id": cell_ids.values.ravel(),
            "value": var_data.values.ravel(),
        }
    )
    df = df.dropna()

    if len(df) == 0:
        logger.warning("no valid data after removing NaN values")
        return {
            "grid_aligned": np.full(self.grid.ncells, np.nan),
            "patch_aligned": np.array([]),
            "cell_ids_with_data": np.array([], dtype=int),
            "metadata": {
                "valid_cells": 0,
                "total_cells": self.grid.ncells,
                "coverage_percent": 0.0,
                "variable": var_name,
                "aggregation": time_agg,
            },
        }

    logger.debug(
        "%d valid observations across %d cells",
        len(df),
        df["cell_id"].nunique(),
    )

    cell_stats = getattr(df.groupby("cell_id")["value"], time_agg)()

    # Vectorized assembly (replaces a per-cell Python loop).  groupby
    # output is sorted by cell_id, so the filtered arrays keep the same
    # ascending order the loop produced; fancy-index assignment with
    # duplicate ids also keeps last-wins semantics.
    stat_ids = cell_stats.index.to_numpy().astype(int)
    stat_vals = cell_stats.to_numpy()  # native dtype (int for 'count')
    in_range = (stat_ids >= 0) & (stat_ids < self.grid.ncells)
    cell_ids_with_data = stat_ids[in_range]
    patch_aligned = stat_vals[in_range]

    grid_aligned = np.full(self.grid.ncells, np.nan)
    grid_aligned[cell_ids_with_data] = patch_aligned

    n_valid = len(patch_aligned)
    logger.info("spatial stats: %d/%d cells with data", n_valid, self.grid.ncells)

    return {
        "grid_aligned": grid_aligned,
        "patch_aligned": patch_aligned,
        "cell_ids_with_data": cell_ids_with_data,
        "metadata": {
            "valid_cells": n_valid,
            "total_cells": self.grid.ncells,
            "coverage_percent": (n_valid / self.grid.ncells) * 100,
            "variable": var_name,
            "aggregation": time_agg,
        },
    }

compare_filtering_methods(original_var='VOD', filtered_var='VOD_filtered', figsize=(16, 6), ax=None)

Histogram comparison of original vs filtered VOD distributions.

Parameters

original_var : str, optional Unfiltered variable name. filtered_var : str, optional Filtered variable name. figsize : tuple, optional Figure size when ax is None. ax : tuple of two plt.Axes or None, optional Pre-existing axes pair. Created if None.

Returns

fig, axes : plt.Figure, np.ndarray of plt.Axes

Source code in packages/canvod-grids/src/canvod/grids/analysis/spatial.py
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def compare_filtering_methods(
    self,
    original_var: str = "VOD",
    filtered_var: str = "VOD_filtered",
    figsize: tuple = (16, 6),
    ax: tuple | None = None,
) -> tuple[plt.Figure, np.ndarray]:
    """Plot side-by-side histograms of original vs filtered VOD.

    Parameters
    ----------
    original_var : str, optional
        Unfiltered variable name.
    filtered_var : str, optional
        Filtered variable name.
    figsize : tuple, optional
        Figure size when *ax* is ``None``.
    ax : tuple of two plt.Axes or None, optional
        Pre-existing axes pair.  Created if ``None``.

    Returns
    -------
    fig, axes : plt.Figure, np.ndarray of plt.Axes

    """
    if ax is not None:
        axes = ax
        fig = axes[0].figure
    else:
        fig, axes = plt.subplots(1, 2, figsize=figsize)

    # Per-cell temporal means for each variant.
    vals_orig = self.compute_spatial_statistics(original_var, "mean")["grid_aligned"]
    vals_filt = self.compute_spatial_statistics(filtered_var, "mean")["grid_aligned"]

    left, right = axes[0], axes[1]

    left.hist(
        vals_orig[np.isfinite(vals_orig)], bins=50, alpha=0.7, label="Original"
    )
    left.set_title("Original VOD Distribution")
    left.set_xlabel("VOD")
    left.set_ylabel("Frequency")

    right.hist(
        vals_filt[np.isfinite(vals_filt)],
        bins=50,
        alpha=0.7,
        label="Filtered",
        color="green",
    )
    right.set_title("Filtered VOD Distribution")
    right.set_xlabel("VOD")
    right.set_ylabel("Frequency")

    fig.tight_layout()
    return fig, axes

Per-Cell VOD Analysis

Per-cell VOD analysis and plotting.

Statistical aggregation, diurnal dynamics, radial distributions, and theta-time heatmaps for per-cell VOD datasets. Handles single datasets or lists of datasets with configurable multi-dataset modes (separate vs averaged).

Classes

PerCellVODAnalyzer Main analysis class. Accepts one or more per-cell datasets (each must expose cell_timeseries, cell_theta, cell_phi).

Utility functions

extract_percell_stats – temporal statistic per cell. percell_to_grid_counts – total observation counts per cell. extract_percell_temporal_stats – range / trend / CV per cell. extract_percell_coverage – data-coverage percentage per cell. percell_to_grid_data – thin wrapper around :func:extract_percell_stats.

Notes

  • Per-cell datasets are expected to have dimensions (cell, time) and variables cell_timeseries, cell_theta, cell_phi, and optionally cell_weights and cell_counts.
  • Spatial visualisation (hemisphere maps) lives in canvod-viz.

PerCellVODAnalyzer

Multi-dataset per-cell VOD analyzer.

Parameters

datasets : xr.Dataset or list of xr.Dataset Per-cell dataset(s). Each must contain cell_timeseries, cell_theta, and cell_phi. labels : list of str or None, optional Human-readable labels for each dataset.

Raises

ValueError If any dataset is missing required variables.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_analysis.py
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
class PerCellVODAnalyzer:
    """Multi-dataset per-cell VOD analyzer.

    Parameters
    ----------
    datasets : xr.Dataset or list of xr.Dataset
        Per-cell dataset(s).  Each must contain ``cell_timeseries``,
        ``cell_theta``, and ``cell_phi``.
    labels : list of str or None, optional
        Human-readable labels for each dataset.

    Raises
    ------
    ValueError
        If any dataset is missing required variables.

    """

    def __init__(
        self,
        datasets: xr.Dataset | list[xr.Dataset],
        labels: list[str] | None = None,
    ) -> None:
        """Initialize the analyzer.

        Parameters
        ----------
        datasets : xr.Dataset | list[xr.Dataset]
            Per-cell dataset(s).
        labels : list[str] | None, optional
            Labels for each dataset.

        """
        # Normalize the single-dataset case to a one-element list so that all
        # downstream methods can iterate over ``self.datasets`` uniformly.
        if isinstance(datasets, xr.Dataset):
            self.datasets = [datasets]
            self.labels = ["Dataset"] if labels is None else [labels[0]]
        else:
            self.datasets = datasets
            # Auto-generate 1-based labels when none are supplied.
            self.labels = labels or [f"Dataset {i + 1}" for i in range(len(datasets))]

        self._validate_datasets()
        logger.info(
            "PerCellVODAnalyzer: %d dataset(s), shapes=%s",
            len(self.datasets),
            [ds.cell_timeseries.shape for ds in self.datasets],
        )

    def _validate_datasets(self) -> None:
        """Raise if any dataset is missing required variables."""
        required = {"cell_timeseries", "cell_theta", "cell_phi"}
        # NOTE(review): only ``data_vars`` is checked; a required variable
        # stored as a coordinate would be reported as missing — confirm the
        # per-cell builder always writes these as data variables.
        for i, ds in enumerate(self.datasets):
            missing = required - set(ds.data_vars)
            if missing:
                raise ValueError(f"Dataset {i + 1} missing variables: {missing}")

    # ------------------------------------------------------------------
    # Computation helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _generate_dataset_hash(ds: xr.Dataset) -> str:
        """Deterministic hash for a per-cell dataset (for optional caching).

        Samples up to 10 000 values from ``cell_timeseries`` for
        efficiency on large datasets.
        """
        # MD5 is used purely as a fast, deterministic fingerprint here
        # (cache key), not for any security purpose.
        parts: list[str] = []

        parts.append(hashlib.md5(ds.time.values.tobytes()).hexdigest()[:8])

        cell_data = ds.cell_timeseries.values
        if cell_data.size > 10_000:
            # Evenly spaced sample across the flattened array keeps the hash
            # cheap while still reflecting the whole time range.
            idx = np.linspace(0, cell_data.size - 1, 10_000, dtype=int)
            sample = cell_data.flat[idx]
        else:
            sample = cell_data.flatten()

        # NaNs are excluded so the hash depends only on observed values.
        valid = sample[np.isfinite(sample)]
        if len(valid) > 0:
            parts.append(hashlib.md5(valid.tobytes()).hexdigest()[:8])

        shape_str = f"{ds.cell_timeseries.shape}_{len(ds.time)}"
        parts.append(hashlib.md5(shape_str.encode()).hexdigest()[:8])

        return hashlib.md5("_".join(parts).encode()).hexdigest()[:16]

    def _compute_diurnal_dynamics(self, ds: xr.Dataset) -> dict:
        """Hourly diurnal statistics (mean, std, count) from one dataset.

        Uses ``cell_weights`` when available.
        """
        cell_ts = ds.cell_timeseries
        hours = ds.time.dt.hour.values
        unique_hours = np.unique(hours)

        means, stds, counts = [], [], []

        for hour in unique_hours:
            mask = hours == hour
            # Defensive guard: unreachable in practice since ``hour`` comes
            # from ``np.unique(hours)``, but kept so the output arrays stay
            # aligned with ``unique_hours`` if that ever changes.
            if not np.any(mask):
                means.append(np.nan)
                stds.append(np.nan)
                counts.append(0)
                continue

            hour_data = cell_ts[:, mask]

            if "cell_weights" in ds.data_vars:
                # Weighted mean over cells for each time step in this hour,
                # then stats across those per-time-step means.  Zero total
                # weight is turned into NaN to avoid division by zero.
                w = ds.cell_weights[:, mask]
                w_sum = np.nansum(w, axis=0)
                w_sum[w_sum == 0] = np.nan
                weighted_means = np.nansum(hour_data * w, axis=0) / w_sum
                means.append(float(np.nanmean(weighted_means)))
                stds.append(float(np.nanstd(weighted_means)))
            else:
                # Unweighted path pools all (cell, time) samples in the hour.
                means.append(float(np.nanmean(hour_data)))
                stds.append(float(np.nanstd(hour_data)))

            counts.append(int(np.sum(np.isfinite(hour_data))))

        return {
            "hours": unique_hours,
            "means": np.array(means),
            "stds": np.array(stds),
            "counts": np.array(counts),
        }

    def _compute_diurnal_dynamics_30min(self, ds: xr.Dataset) -> dict:
        """30-minute-resolution diurnal statistics from one dataset."""
        cell_ts = ds.cell_timeseries
        hours = ds.time.dt.hour.values
        minutes = ds.time.dt.minute.values
        # Encode each half-hour as an integer 0..47: bin = hour*2 (+1 for the
        # second half of the hour).
        bins_30 = hours * 2 + (minutes >= 30).astype(int)
        unique_bins = np.unique(bins_30)

        means, stds, counts, labels = [], [], [], []

        for b in unique_bins:
            mask = bins_30 == b
            if not np.any(mask):
                means.append(np.nan)
                stds.append(np.nan)
                counts.append(0)
            else:
                bin_data = cell_ts[:, mask]
                if "cell_weights" in ds.data_vars:
                    # Same weighting scheme as _compute_diurnal_dynamics.
                    w = ds.cell_weights[:, mask]
                    w_sum = np.nansum(w, axis=0)
                    w_sum[w_sum == 0] = np.nan
                    wm = np.nansum(bin_data * w, axis=0) / w_sum
                    means.append(float(np.nanmean(wm)))
                    stds.append(float(np.nanstd(wm)))
                else:
                    means.append(float(np.nanmean(bin_data)))
                    stds.append(float(np.nanstd(bin_data)))
                counts.append(int(np.sum(np.isfinite(bin_data))))

            # Decode the bin index back to an "HH:MM" label.
            h, m = int(b) // 2, 30 if (int(b) % 2) else 0
            labels.append(f"{h:02d}:{m:02d}")

        return {
            "time_bins": unique_bins,
            "time_labels": labels,
            "means": np.array(means),
            "stds": np.array(stds),
            "counts": np.array(counts),
        }

    def _compute_averaged_diurnal_dynamics(self) -> dict:
        """Average diurnal dynamics across all datasets.

        Uncertainty is propagated as σ_avg = √(Σσᵢ²) / N.
        """
        all_data = [self._compute_diurnal_dynamics(ds) for ds in self.datasets]
        all_hours = [d["hours"] for d in all_data]
        # Only hours present in *every* dataset are averaged; hours covered by
        # a subset of datasets are dropped entirely.
        common = sorted(set.intersection(*[set(h) for h in all_hours]))

        avg_means, avg_stds, avg_counts = [], [], []
        for hour in common:
            h_means, h_stds, h_counts = [], [], []
            for d in all_data:
                idx = np.where(d["hours"] == hour)[0]
                if len(idx):
                    h_means.append(d["means"][idx[0]])
                    h_stds.append(d["stds"][idx[0]])
                    h_counts.append(d["counts"][idx[0]])
            if h_means:
                avg_means.append(np.nanmean(h_means))
                # Quadrature sum divided by N, per the docstring formula.
                avg_stds.append(np.sqrt(np.nansum(np.array(h_stds) ** 2)) / len(h_stds))
                avg_counts.append(int(np.sum(h_counts)))

        return {
            "hours": np.array(common),
            "means": np.array(avg_means),
            "stds": np.array(avg_stds),
            "counts": np.array(avg_counts),
        }

    def _compute_combined_diurnal_distributions(self) -> dict[int, list[float]]:
        """Collect all hourly value distributions across all datasets."""
        combined: dict[int, list[float]] = {}
        for hour in range(24):
            vals: list[float] = []
            for ds in self.datasets:
                mask = ds.time.dt.hour.values == hour
                if np.any(mask):
                    raw = ds.cell_timeseries[:, mask].values.flatten()
                    # Keep only finite samples; hours with no data are omitted
                    # from the result dict rather than mapped to [].
                    vals.extend(raw[np.isfinite(raw)].tolist())
            if vals:
                combined[hour] = vals
        return combined

    def _compute_radial_distribution(self, ds: xr.Dataset) -> dict:
        """Bin mean VOD by polar angle (5° bins, 0–90°)."""
        # NOTE(review): ``90 - cell_theta`` suggests cell_theta stores an
        # elevation-like angle that is converted to a polar angle here —
        # confirm the cell_theta convention against the grid builder.
        polar = 90.0 - ds.cell_theta.values
        mean_vod = ds.cell_timeseries.mean(dim="time").values

        edges = np.arange(0, 95, 5)
        centers = edges[:-1] + 2.5
        # np.digitize returns 1-based bin indices for these edges.
        indices = np.digitize(polar, edges)

        binned, labels = [], []
        for i, c in enumerate(centers):
            mask = indices == i + 1
            if np.any(mask):
                clean = mean_vod[mask]
                clean = clean[np.isfinite(clean)]
                if len(clean) > 0:
                    binned.append(clean)
                    labels.append(f"{edges[i]:.0f}-{edges[i + 1]:.0f}°")

        return {
            "binned_data": binned,
            "bin_labels": labels,
            # NOTE(review): empty bins are skipped above, so this slice only
            # lines up with ``binned`` when the populated bins are a prefix of
            # ``centers`` — verify against consumers.
            "bin_centers": centers[: len(binned)],
        }

    def _compute_averaged_radial_distribution(self) -> dict:
        """Average radial distributions across all datasets."""
        all_radial = [self._compute_radial_distribution(ds) for ds in self.datasets]
        all_centers = [d["bin_centers"] for d in all_radial]
        # Keep only bin centers common to every dataset (see the diurnal
        # averaging method for the same intersection pattern).
        common = sorted(set.intersection(*[set(c) for c in all_centers]))

        avg_binned, avg_labels = [], []
        for c in common:
            combined: list[float] = []
            for d in all_radial:
                # Float-tolerant match on the bin center.
                idx = np.where(np.abs(d["bin_centers"] - c) < 0.1)[0]
                if len(idx):
                    combined.extend(d["binned_data"][idx[0]])
            if combined:
                avg_binned.append(combined)
                avg_labels.append(f"{c - 2.5:.0f}-{c + 2.5:.0f}°")

        return {
            "binned_data": avg_binned,
            "bin_labels": avg_labels,
            "bin_centers": np.array(common),
        }

    def _compute_single_theta_time_heatmap(
        self, ds: xr.Dataset, time_aggregation: str, theta_bins: int
    ) -> tuple[np.ndarray, dict]:
        """Compute one theta × time heatmap."""
        polar = 90.0 - ds.cell_theta.values
        cell_ts = ds.cell_timeseries.values
        time_coord = ds.time.values

        edges = np.linspace(0, 90, theta_bins + 1)
        time_info = self._process_time_aggregation(time_coord, time_aggregation)

        # Cells/time-bins with no data remain NaN.
        heatmap = np.full((theta_bins, time_info["n_time_bins"]), np.nan)

        for i in range(theta_bins):
            # Half-open bins; a value of exactly 90° falls outside the last
            # bin (polar < edges[-1]).
            theta_mask = (polar >= edges[i]) & (polar < edges[i + 1])
            if not np.any(theta_mask):
                continue
            bin_ts = cell_ts[theta_mask, :]
            for t in range(time_info["n_time_bins"]):
                t_mask = time_info["time_groups"] == t
                if np.any(t_mask):
                    heatmap[i, t] = np.nanmean(bin_ts[:, t_mask])

        return heatmap, time_info

    def _compute_averaged_theta_time_heatmap(
        self, time_aggregation: str, theta_bins: int
    ) -> tuple[np.ndarray, dict]:
        """Average theta-time heatmaps across all datasets."""
        # The time_info (ticks/labels) of the *first* dataset is reused for
        # the averaged heatmap; all datasets are assumed to share a
        # compatible time axis for the chosen aggregation.
        heatmaps, time_info = [], None
        for ds in self.datasets:
            h, ti = self._compute_single_theta_time_heatmap(
                ds,
                time_aggregation,
                theta_bins,
            )
            heatmaps.append(h)
            if time_info is None:
                time_info = ti
        return np.nanmean(np.stack(heatmaps, axis=0), axis=0), time_info

    @staticmethod
    def _process_time_aggregation(
        time_coord: np.ndarray,
        time_aggregation: str,
    ) -> dict:
        """Map raw timestamps to integer group indices.

        Returns
        -------
        dict
            ``time_groups``, ``n_time_bins``, ``time_ticks``,
            ``time_tick_labels``, ``x_label``.

        """
        if time_aggregation == "diurnal":
            groups = pd.to_datetime(time_coord).hour.values
            n = 24
            ticks = np.arange(0, 24, 4)
            tick_labels = [f"{h:02d}:00" for h in ticks]
            xlabel = "Hour of Day (UTC)"

        elif time_aggregation == "daily":
            # "daily" assigns one bin per timestamp (no actual grouping);
            # it assumes the time coordinate is already at daily resolution.
            groups = np.arange(len(time_coord))
            n = len(time_coord)
            ticks = np.arange(0, n, max(1, n // 10))
            tick_labels = (
                pd.to_datetime(time_coord)[ticks].strftime("%Y-%m-%d").tolist()
            )
            xlabel = "Date"

        elif time_aggregation == "weekly":
            dates = pd.to_datetime(time_coord)
            # ISO week numbers; weeks with the same number in different years
            # are pooled together.
            weeks = dates.isocalendar().week
            unique_weeks = np.unique(weeks)
            groups = np.searchsorted(unique_weeks, weeks)
            n = len(unique_weeks)
            ticks = np.arange(0, n, max(1, n // 10))
            tick_labels = [f"Week {unique_weeks[i]}" for i in ticks]
            xlabel = "Week"

        elif time_aggregation == "monthly":
            dates = pd.to_datetime(time_coord)
            # Month number only: the same month across different years is
            # pooled into one bin (climatological monthly aggregation).
            months = dates.month
            unique_months = np.unique(months)
            groups = np.searchsorted(unique_months, months)
            n = len(unique_months)
            ticks = np.arange(0, n)
            tick_labels = [f"Month {unique_months[i]}" for i in ticks]
            xlabel = "Month"

        else:
            raise ValueError(f"Unknown time_aggregation: '{time_aggregation}'")

        return {
            "time_groups": groups,
            "n_time_bins": n,
            "time_ticks": ticks,
            "time_tick_labels": tick_labels,
            "x_label": xlabel,
        }

__init__(datasets, labels=None)

Initialize the analyzer.

Parameters

datasets : xr.Dataset | list[xr.Dataset] Per-cell dataset(s). labels : list[str] | None, optional Labels for each dataset.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_analysis.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def __init__(
    self,
    datasets: xr.Dataset | list[xr.Dataset],
    labels: list[str] | None = None,
) -> None:
    """Initialize the analyzer.

    Parameters
    ----------
    datasets : xr.Dataset | list[xr.Dataset]
        Per-cell dataset(s).
    labels : list[str] | None, optional
        Labels for each dataset.

    """
    # Normalize the single-dataset case to a one-element list so downstream
    # methods can always iterate over self.datasets.
    if isinstance(datasets, xr.Dataset):
        self.datasets = [datasets]
        self.labels = ["Dataset"] if labels is None else [labels[0]]
    else:
        self.datasets = datasets
        # Auto-generate 1-based labels when none are supplied.
        self.labels = labels or [f"Dataset {i + 1}" for i in range(len(datasets))]

    self._validate_datasets()
    logger.info(
        "PerCellVODAnalyzer: %d dataset(s), shapes=%s",
        len(self.datasets),
        [ds.cell_timeseries.shape for ds in self.datasets],
    )

extract_percell_stats(percell_ds, stat='median')

Compute a single temporal statistic for every cell.

Parameters

percell_ds : xr.Dataset Per-cell dataset with a cell_timeseries variable of shape (cell, time). stat : {"mean", "median", "std"} Statistic to reduce across the time dimension.

Returns

np.ndarray 1-D array of length n_cells.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_analysis.py
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
def extract_percell_stats(
    percell_ds: xr.Dataset,
    stat: Literal["mean", "median", "std"] = "median",
) -> np.ndarray:
    """Reduce each cell's time series to a single statistic.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Per-cell dataset with a ``cell_timeseries`` variable of shape
        ``(cell, time)``.
    stat : {"mean", "median", "std"}
        Statistic to reduce across the ``time`` dimension.

    Returns
    -------
    np.ndarray
        1-D array of length ``n_cells``.

    """
    # Guard clause first, then dispatch to the matching xarray reduction by
    # name ("mean"/"median"/"std" are all DataArray methods).
    if stat not in ("mean", "median", "std"):
        raise ValueError(f"Unsupported stat: {stat}")

    series = percell_ds.cell_timeseries
    reduced = getattr(series, stat)(dim="time")
    return reduced.values

percell_to_grid_counts(percell_ds)

Sum observation counts across time for each cell.

Parameters

percell_ds : xr.Dataset Per-cell dataset with a cell_counts variable of shape (cell, time).

Returns

np.ndarray 1-D array of total counts per cell.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_analysis.py
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
def percell_to_grid_counts(percell_ds: xr.Dataset) -> np.ndarray:
    """Total observation count per cell, summed over the time axis.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Per-cell dataset with a ``cell_counts`` variable of shape
        ``(cell, time)``.

    Returns
    -------
    np.ndarray
        1-D array of total counts per cell.

    """
    # Collapse the time dimension of cell_counts in a single chained call.
    return percell_ds.cell_counts.sum(dim="time").values

extract_percell_temporal_stats(percell_ds, stat='range')

Compute a temporal-characteristic statistic for every cell.

Parameters

percell_ds : xr.Dataset Per-cell dataset with cell_timeseries. stat : {"range", "trend", "cv"} Statistic:

* ``"range"`` – max − min over time.
* ``"trend"`` – linear-regression slope (requires ≥ 4 valid points;
  otherwise NaN).  Uses :mod:`scipy.stats`.
* ``"cv"``    – coefficient of variation (std / mean).

Returns

np.ndarray 1-D array of length n_cells.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_analysis.py
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
def extract_percell_temporal_stats(
    percell_ds: xr.Dataset,
    stat: Literal["range", "trend", "cv"] = "range",
) -> np.ndarray:
    """Compute a temporal-characteristic statistic for every cell.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Per-cell dataset with ``cell_timeseries``.
    stat : {"range", "trend", "cv"}
        Statistic:

        * ``"range"`` – max − min over time.
        * ``"trend"`` – linear-regression slope (requires ≥ 4 valid points;
          otherwise NaN).  Uses :mod:`scipy.stats`.
        * ``"cv"``    – coefficient of variation (std / mean).

    Returns
    -------
    np.ndarray
        1-D array of length ``n_cells``.

    """
    from scipy import stats

    cell_timeseries = percell_ds.cell_timeseries

    if stat == "range":
        cell_max = cell_timeseries.max(dim="time")
        cell_min = cell_timeseries.min(dim="time")
        result = cell_max - cell_min

    elif stat == "trend":
        # Materialize the array ONCE: ``.values`` can trigger a full (and
        # potentially lazy/dask-backed) conversion, so the original per-cell
        # ``cell_timeseries.values[i, :]`` repeated that cost for every cell.
        ts_values = cell_timeseries.values
        n_time = ts_values.shape[1]
        all_indices = np.arange(n_time)

        trends = []
        for cell_data in ts_values:
            valid_mask = np.isfinite(cell_data)

            # linregress needs at least a handful of points for a meaningful
            # slope; fewer than 4 valid samples yields NaN.
            if np.sum(valid_mask) > 3:
                slope, _, _, _, _ = stats.linregress(
                    all_indices[valid_mask], cell_data[valid_mask]
                )
                trends.append(slope)
            else:
                trends.append(np.nan)

        result = xr.DataArray(trends, dims=["cell"])

    elif stat == "cv":
        # NOTE: cells with zero mean produce inf/NaN here by design.
        cell_mean = cell_timeseries.mean(dim="time")
        cell_std = cell_timeseries.std(dim="time")
        result = cell_std / cell_mean

    else:
        raise ValueError(f"Unsupported temporal stat: {stat}")

    return result.values

extract_percell_coverage(percell_ds)

Compute the fraction of valid (non-NaN) observations per cell.

Parameters

percell_ds : xr.Dataset Per-cell dataset with cell_timeseries.

Returns

np.ndarray 1-D array of coverage percentages (0–100) per cell.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_analysis.py
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
def extract_percell_coverage(percell_ds: xr.Dataset) -> np.ndarray:
    """Percentage of finite observations in each cell's time series.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Per-cell dataset with ``cell_timeseries``.

    Returns
    -------
    np.ndarray
        1-D array of coverage percentages (0–100) per cell.

    """
    series = percell_ds.cell_timeseries

    # Count finite samples along time and scale by the time-axis length.
    n_valid = np.isfinite(series).sum(dim="time")
    n_total = series.sizes["time"]

    return ((n_valid / n_total) * 100).values

percell_to_grid_data(percell_ds, stat='median')

Thin wrapper around extract_percell_stats.

Provided for symmetry with aggregate_data_to_grid workflows.

Parameters

percell_ds : xr.Dataset Per-cell dataset. stat : {"mean", "median", "std"} Statistic to compute.

Returns

np.ndarray 1-D array compatible with grid visualisation.

Source code in packages/canvod-grids/src/canvod/grids/analysis/per_cell_analysis.py
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
def percell_to_grid_data(
    percell_ds: xr.Dataset,
    stat: Literal["mean", "median", "std"] = "median",
) -> np.ndarray:
    """Alias for :func:`extract_percell_stats`.

    Exists so grid-oriented workflows (``aggregate_data_to_grid``) have a
    symmetrically named entry point.

    Parameters
    ----------
    percell_ds : xr.Dataset
        Per-cell dataset.
    stat : {"mean", "median", "std"}
        Statistic to compute.

    Returns
    -------
    np.ndarray
        1-D array compatible with grid visualisation.

    """
    # Pure delegation — all validation and reduction happen in the target.
    return extract_percell_stats(percell_ds=percell_ds, stat=stat)

Analysis Storage

Persistent storage for precomputed analysis results via Icechunk.

Stores dataset+grid-specific analysis outputs in an Icechunk repository under metadata/{dataset_name}/{grid_name}/:

  • weights – per-cell weight arrays (ncells,).
  • filter_masks – per-observation statistical masks (epoch × sid).
  • spatial_masks – per-cell geometric selection masks (ncells,).
  • statistics – per-cell aggregated statistics (ncells,).

Depends on canvod-store at runtime. Install the package first::

uv add canvod-store

Classes

AnalysisStorage Read / write / delete analysis metadata for a single Icechunk store.

AnalysisStorage

Manage persistent storage of analysis results for dataset+grid pairs.

Storage layout inside the Icechunk repository::

metadata/{dataset_name}/{grid_name}/
├── weights/              # (ncells,)
│   ├── observation_count
│   ├── solid_angle
│   └── combined
├── filter_masks/         # (epoch, sid)
│   ├── mask_iqr
│   └── mask_zscore
├── spatial_masks/        # (ncells,)
│   ├── mask_north
│   └── mask_high_elevation
└── statistics/           # (ncells,)
    ├── obs_count
    ├── mean_vod
    └── std_vod

Parameters

store_path : Path or str Path to the VOD Icechunk store directory.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
class AnalysisStorage:
    """Manage persistent storage of analysis results for dataset+grid pairs.

    Storage layout inside the Icechunk repository::

        metadata/{dataset_name}/{grid_name}/
        ├── weights/              # (ncells,)
        │   ├── observation_count
        │   ├── solid_angle
        │   └── combined
        ├── filter_masks/         # (epoch, sid)
        │   ├── mask_iqr
        │   └── mask_zscore
        ├── spatial_masks/        # (ncells,)
        │   ├── mask_north
        │   └── mask_high_elevation
        └── statistics/           # (ncells,)
            ├── obs_count
            ├── mean_vod
            └── std_vod

    Parameters
    ----------
    store_path : Path or str
        Path to the VOD Icechunk store directory.

    """

    def __init__(self, store_path: Path | str) -> None:
        """Initialize the storage manager.

        Parameters
        ----------
        store_path : Path | str
            Path to the VOD Icechunk store directory.

        """
        self.store_path = Path(store_path)
        # Handle to the underlying Icechunk store, obtained from the
        # module-level _get_store helper (defined elsewhere in this module).
        self.store: MyIcechunkStore = _get_store(self.store_path)

    def __repr__(self) -> str:
        """Return the developer-facing representation.

        Returns
        -------
        str
            Representation string.

        """
        return f"AnalysisStorage(store_path={self.store_path})"

    # ------------------------------------------------------------------
    # Weights
    # ------------------------------------------------------------------

    def store_weights(
        self,
        dataset_name: str,
        grid_name: str,
        weights: dict[str, np.ndarray],
        weight_params: dict[str, dict] | None = None,
        overwrite: bool = False,
    ) -> str:
        """Store per-cell weight arrays.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier (e.g. ``'reference_01_canopy_01'``).
        grid_name : str
            Grid identifier (e.g. ``'equal_area_2deg'``).
        weights : dict
            ``{name: array}`` – all arrays must have shape ``(ncells,)``.
        weight_params : dict, optional
            Parameters used to compute each weight type.
        overwrite : bool
            Overwrite existing weights.

        Returns
        -------
        str
            Icechunk snapshot ID.

        Raises
        ------
        ValueError
            If any weight array does not have shape ``(ncells,)``.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/weights"
        logger.info("Storing weights to %s", group_path)

        # Validate shapes
        # The first array in the dict defines the expected cell count.
        ncells = len(next(iter(weights.values())))
        for name, arr in weights.items():
            if arr.shape != (ncells,):
                raise ValueError(
                    f"Weight '{name}' has shape {arr.shape}, expected ({ncells},)"
                )

        # Build xarray dataset
        # float32 keeps the on-disk footprint small for per-cell weights.
        weight_vars = {
            name: (["cell"], arr.astype(np.float32)) for name, arr in weights.items()
        }
        ds_weights = xr.Dataset(
            weight_vars, coords={"cell": np.arange(ncells, dtype=np.int32)}
        )

        attrs: dict[str, Any] = {
            # NOTE(review): datetime.now() is timezone-naive; confirm whether
            # a UTC-aware timestamp is intended for provenance metadata.
            "created_at": datetime.now().isoformat(),
            "dataset": dataset_name,
            "grid": grid_name,
            "weight_types": list(weights.keys()),
            "ncells": ncells,
        }
        if weight_params:
            # Stored as a string because zarr attrs must be JSON-serialisable.
            attrs["weight_parameters"] = str(weight_params)
        ds_weights.attrs.update(attrs)

        # Persist
        with self.store.writable_session() as session:
            # Deferred import: only needed when actually writing.
            from icechunk.xarray import to_icechunk

            # "w" replaces an existing group; "w-" fails if one already exists.
            mode = "w" if overwrite else "w-"
            to_icechunk(ds_weights, session, group=group_path, mode=mode)
            snapshot_id: str = session.commit(
                f"Stored weights for {dataset_name}/{grid_name}"
            )

        logger.info("Weights stored (snapshot: %s)", snapshot_id[:8])
        return snapshot_id

    def load_weights(
        self,
        dataset_name: str,
        grid_name: str,
        weight_type: str | None = None,
    ) -> dict[str, np.ndarray]:
        """Load stored weight arrays.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        weight_type : str, optional
            Load only this weight.  ``None`` loads all.

        Returns
        -------
        dict
            ``{name: ndarray}`` of loaded weights.

        Raises
        ------
        ValueError
            If ``weight_type`` is given but not stored.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/weights"

        try:
            with self.store.readonly_session() as session:
                ds_weights = xr.open_zarr(
                    session.store, group=group_path, consolidated=False
                )

            if weight_type:
                if weight_type not in ds_weights:
                    raise ValueError(
                        f"Weight '{weight_type}' not found. "
                        f"Available: {list(ds_weights.data_vars)}"
                    )
                return {weight_type: ds_weights[weight_type].values}
            return {var: ds_weights[var].values for var in ds_weights.data_vars}

        except Exception:
            # Log the full traceback before propagating to the caller.
            logger.error("Failed to load weights from %s", group_path, exc_info=True)
            raise

    def has_weights(self, dataset_name: str, grid_name: str) -> bool:
        """Return ``True`` if weights exist for the dataset+grid pair."""
        try:
            # NOTE(review): existence is probed by fully loading the weights;
            # a lightweight "group exists" check would avoid reading the data.
            self.load_weights(dataset_name, grid_name)
            return True
        except Exception:
            return False

    # ------------------------------------------------------------------
    # Filter masks (per observation)
    # ------------------------------------------------------------------

    def store_filter_masks(
        self,
        dataset_name: str,
        grid_name: str,
        masks: dict[str, xr.DataArray],
        filter_params: dict[str, dict] | None = None,
        overwrite: bool = False,
    ) -> str:
        """Store per-observation filter masks at native resolution.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        masks : dict
            ``{filter_name: DataArray}`` – all must share the same
            ``(epoch, sid)`` shape.
        filter_params : dict, optional
            Parameters used for each filter.
        overwrite : bool
            Overwrite existing masks.

        Returns
        -------
        str
            Icechunk snapshot ID.

        Raises
        ------
        TypeError
            If a mask is not an ``xr.DataArray``.
        ValueError
            If the masks do not all share the same shape.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"
        logger.info("Storing filter masks to %s", group_path)

        # All masks must share the first mask's (epoch, sid) shape.
        first_mask = next(iter(masks.values()))
        shape = first_mask.shape

        mask_vars: dict[str, xr.DataArray] = {}
        for name, mask_array in masks.items():
            if not isinstance(mask_array, xr.DataArray):
                raise TypeError(f"Mask '{name}' must be xr.DataArray")
            if mask_array.shape != shape:
                raise ValueError("All masks must have same shape")
            # Stored as int8; loaders convert back with .astype(bool).
            mask_vars[f"mask_{name}"] = mask_array.astype(np.int8)

        ds_masks = xr.Dataset(mask_vars)

        attrs: dict[str, Any] = {
            "created_at": datetime.now().isoformat(),
            "dataset": dataset_name,
            "grid": grid_name,
            "filter_types": list(masks.keys()),
            "shape": str(shape),
            # Records which group load_filter_mask() should read coordinates
            # from when re-attaching them.
            "coordinate_source": f"/{dataset_name}/",
        }
        if filter_params:
            attrs["filter_parameters"] = str(filter_params)
        ds_masks.attrs.update(attrs)

        # Rechunk for efficient columnar storage
        # 10k epochs per chunk; sid kept as a single chunk (-1).
        logger.info("Rechunking masks for efficient storage")
        ds_masks = ds_masks.chunk({"epoch": 10000, "sid": -1})

        with self.store.writable_session() as session:
            import dask
            from icechunk.xarray import to_icechunk

            logger.info("Writing masks (this may take a few minutes)")
            # Cap the write at 4 worker threads.
            with dask.config.set(scheduler="threads", num_workers=4):
                mode = "w" if overwrite else "w-"
                to_icechunk(ds_masks, session, group=group_path, mode=mode)

            logger.info("Committing")
            snapshot_id: str = session.commit(
                f"Stored filter masks for {dataset_name}/{grid_name}"
            )

        logger.info("Filter masks stored (snapshot: %s)", snapshot_id[:8])
        return snapshot_id

    def load_filter_mask(
        self,
        dataset_name: str,
        grid_name: str,
        filter_type: str,
        attach_coords: bool = True,
    ) -> xr.DataArray:
        """Load a single filter mask.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        filter_type : str
            Filter name (e.g. ``'iqr'``, ``'zscore'``).
        attach_coords : bool
            Re-attach ``epoch`` / ``sid`` coordinates from the source
            dataset group.  Set ``False`` for faster loading when
            coordinates are not needed.

        Returns
        -------
        xr.DataArray
            Boolean mask with shape ``(epoch, sid)``.

        Raises
        ------
        ValueError
            If the requested filter mask is not stored.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"

        try:
            with self.store.readonly_session() as session:
                ds_masks = xr.open_zarr(
                    session.store, group=group_path, consolidated=False
                )

            # Variables are stored with a "mask_" prefix (see store_filter_masks).
            mask_var = f"mask_{filter_type}"
            if mask_var not in ds_masks:
                available = [v.replace("mask_", "") for v in ds_masks.data_vars]
                raise ValueError(
                    f"Filter mask '{filter_type}' not found. Available: {available}"
                )

            # Round-trip the int8 storage dtype back to bool.
            mask = ds_masks[mask_var].astype(bool)

            if attach_coords:
                # The source group was recorded at store time; fall back to
                # the dataset's root group if the attr is missing.
                coord_source = ds_masks.attrs.get(
                    "coordinate_source", f"/{dataset_name}/"
                )
                with self.store.readonly_session() as session:
                    ds_source = xr.open_zarr(
                        session.store,
                        group=coord_source.strip("/"),
                        consolidated=False,
                    )

                mask = mask.assign_coords(
                    {"epoch": ds_source["epoch"], "sid": ds_source["sid"]}
                )
                # Optional signal-descriptor coordinates, copied when present.
                for coord in [
                    "band",
                    "code",
                    "sv",
                    "system",
                    "freq_min",
                    "freq_max",
                    "freq_center",
                ]:
                    if coord in ds_source.coords:
                        mask = mask.assign_coords({coord: ds_source[coord]})

            return mask

        except Exception:
            logger.error("Failed to load filter mask", exc_info=True)
            raise

    def load_all_filter_masks(
        self, dataset_name: str, grid_name: str
    ) -> dict[str, xr.DataArray]:
        """Load all stored filter masks.

        Returns
        -------
        dict
            ``{filter_name: DataArray}`` – boolean masks.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"

        try:
            with self.store.readonly_session() as session:
                ds_masks = xr.open_zarr(
                    session.store, group=group_path, consolidated=False
                )
            # Strip the "mask_" storage prefix and restore bool dtype.
            return {
                var.replace("mask_", ""): ds_masks[var].astype(bool)
                for var in ds_masks.data_vars
            }
        except Exception:
            logger.error(
                "Failed to load filter masks from %s", group_path, exc_info=True
            )
            raise

    def has_filter_masks(self, dataset_name: str, grid_name: str) -> bool:
        """Return ``True`` if filter masks exist for the dataset+grid pair."""
        try:
            # NOTE(review): probes existence via a full load (see has_weights).
            self.load_all_filter_masks(dataset_name, grid_name)
            return True
        except Exception:
            return False

    # ------------------------------------------------------------------
    # Spatial masks (per cell)
    # ------------------------------------------------------------------

    def store_spatial_masks(
        self,
        dataset_name: str,
        grid_name: str,
        masks: dict[str, np.ndarray],
        mask_descriptions: dict[str, str] | None = None,
        overwrite: bool = False,
    ) -> str:
        """Store per-cell geometric selection masks.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        masks : dict
            ``{name: bool_array}`` – all arrays must have shape ``(ncells,)``
            and boolean dtype.
        mask_descriptions : dict, optional
            Human-readable description for each mask (stored as variable attrs).
        overwrite : bool
            Overwrite existing masks.

        Returns
        -------
        str
            Icechunk snapshot ID.

        Raises
        ------
        ValueError
            If a mask has the wrong shape or a non-boolean dtype.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/spatial_masks"
        logger.info("Storing spatial masks to %s", group_path)

        ncells = len(next(iter(masks.values())))
        for name, arr in masks.items():
            if arr.shape != (ncells,):
                raise ValueError(
                    f"Mask '{name}' has shape {arr.shape}, expected ({ncells},)"
                )
            if arr.dtype != bool:
                raise ValueError(
                    f"Mask '{name}' must be boolean dtype, got {arr.dtype}"
                )

        # Stored as int8; loaders convert back with .astype(bool).
        mask_vars = {
            f"mask_{name}": (["cell"], arr.astype(np.int8))
            for name, arr in masks.items()
        }
        ds_masks = xr.Dataset(
            mask_vars, coords={"cell": np.arange(ncells, dtype=np.int32)}
        )

        # Per-variable metadata
        for name, arr in masks.items():
            n_sel = int(arr.sum())
            ds_masks[f"mask_{name}"].attrs["n_cells_selected"] = n_sel
            ds_masks[f"mask_{name}"].attrs["fraction_selected"] = float(n_sel / ncells)
        if mask_descriptions:
            for name, desc in mask_descriptions.items():
                # Silently ignore descriptions for masks that were not provided.
                if name in masks:
                    ds_masks[f"mask_{name}"].attrs["description"] = desc

        ds_masks.attrs.update(
            {
                "created_at": datetime.now().isoformat(),
                "dataset": dataset_name,
                "grid": grid_name,
                "mask_types": list(masks.keys()),
                "ncells": ncells,
            }
        )

        with self.store.writable_session() as session:
            from icechunk.xarray import to_icechunk

            mode = "w" if overwrite else "w-"
            to_icechunk(ds_masks, session, group=group_path, mode=mode)
            snapshot_id: str = session.commit(
                f"Stored spatial masks for {dataset_name}/{grid_name}"
            )

        logger.info("Spatial masks stored (snapshot: %s)", snapshot_id[:8])
        return snapshot_id

    def load_spatial_mask(
        self, dataset_name: str, grid_name: str, mask_name: str
    ) -> np.ndarray:
        """Load a single spatial mask.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        mask_name : str
            Mask name (e.g. ``'north'``, ``'high_elevation'``).

        Returns
        -------
        np.ndarray
            Boolean array with shape ``(ncells,)``.

        Raises
        ------
        ValueError
            If the requested mask is not stored.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/spatial_masks"

        try:
            with self.store.readonly_session() as session:
                ds_masks = xr.open_zarr(
                    session.store, group=group_path, consolidated=False
                )

            mask_var = f"mask_{mask_name}"
            if mask_var not in ds_masks:
                available = [v.replace("mask_", "") for v in ds_masks.data_vars]
                raise ValueError(
                    f"Spatial mask '{mask_name}' not found. Available: {available}"
                )
            return ds_masks[mask_var].values.astype(bool)

        except Exception:
            logger.error(
                "Failed to load spatial mask from %s", group_path, exc_info=True
            )
            raise

    def load_all_spatial_masks(
        self, dataset_name: str, grid_name: str
    ) -> dict[str, np.ndarray]:
        """Load all spatial masks.

        Returns
        -------
        dict
            ``{name: bool_ndarray}``.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/spatial_masks"

        try:
            with self.store.readonly_session() as session:
                ds_masks = xr.open_zarr(
                    session.store, group=group_path, consolidated=False
                )
            return {
                var.replace("mask_", ""): ds_masks[var].values.astype(bool)
                for var in ds_masks.data_vars
            }
        except Exception:
            logger.error(
                "Failed to load spatial masks from %s", group_path, exc_info=True
            )
            raise

    def has_spatial_masks(self, dataset_name: str, grid_name: str) -> bool:
        """Return ``True`` if spatial masks exist for the dataset+grid pair."""
        try:
            # NOTE(review): probes existence via a full load (see has_weights).
            self.load_all_spatial_masks(dataset_name, grid_name)
            return True
        except Exception:
            return False

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------

    def store_statistics(
        self,
        dataset_name: str,
        grid_name: str,
        stats: dict[str, np.ndarray],
        overwrite: bool = False,
    ) -> str:
        """Store pre-computed per-cell statistics.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        stats : dict
            ``{name: array}`` – all arrays must have shape ``(ncells,)``.
            Variables whose name ends with ``'_count'`` or equals
            ``'obs_count'`` are stored as ``int64``; everything else as
            ``float32``.
        overwrite : bool
            Overwrite existing statistics.

        Returns
        -------
        str
            Icechunk snapshot ID.

        Raises
        ------
        ValueError
            If any statistic array does not have shape ``(ncells,)``.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/statistics"
        logger.info("Storing statistics to %s", group_path)

        ncells = len(next(iter(stats.values())))
        for name, arr in stats.items():
            if arr.shape != (ncells,):
                raise ValueError(
                    f"Statistic '{name}' has shape {arr.shape}, expected ({ncells},)"
                )

        # Counts stay integral; everything else is compacted to float32.
        stat_vars = {}
        for name, arr in stats.items():
            dtype = (
                np.int64
                if (name.endswith("_count") or name == "obs_count")
                else np.float32
            )
            stat_vars[name] = (["cell"], arr.astype(dtype))

        ds_stats = xr.Dataset(
            stat_vars, coords={"cell": np.arange(ncells, dtype=np.int32)}
        )

        # Summary statistics per variable (stored as variable attrs)
        for name, arr in stats.items():
            valid = np.isfinite(arr)
            if np.any(valid):
                # np.nan* reducers ignore NaN cells in the summaries.
                ds_stats[name].attrs.update(
                    {
                        "min": float(np.nanmin(arr)),
                        "max": float(np.nanmax(arr)),
                        "mean": float(np.nanmean(arr)),
                        "n_valid": int(np.sum(valid)),
                    }
                )

        ds_stats.attrs.update(
            {
                "created_at": datetime.now().isoformat(),
                "dataset": dataset_name,
                "grid": grid_name,
                "statistics": list(stats.keys()),
                "ncells": ncells,
            }
        )

        with self.store.writable_session() as session:
            from icechunk.xarray import to_icechunk

            mode = "w" if overwrite else "w-"
            to_icechunk(ds_stats, session, group=group_path, mode=mode)
            snapshot_id: str = session.commit(
                f"Stored statistics for {dataset_name}/{grid_name}"
            )

        logger.info("Statistics stored (snapshot: %s)", snapshot_id[:8])
        return snapshot_id

    def load_statistics(
        self,
        dataset_name: str,
        grid_name: str,
        stat_name: str | None = None,
    ) -> dict[str, np.ndarray]:
        """Load pre-computed per-cell statistics.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        stat_name : str, optional
            Load only this statistic.  ``None`` loads all.

        Returns
        -------
        dict
            ``{name: ndarray}``.

        Raises
        ------
        ValueError
            If ``stat_name`` is given but not stored.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/statistics"

        try:
            with self.store.readonly_session() as session:
                ds_stats = xr.open_zarr(
                    session.store, group=group_path, consolidated=False
                )

            if stat_name:
                if stat_name not in ds_stats:
                    raise ValueError(
                        f"Statistic '{stat_name}' not found. "
                        f"Available: {list(ds_stats.data_vars)}"
                    )
                return {stat_name: ds_stats[stat_name].values}
            return {var: ds_stats[var].values for var in ds_stats.data_vars}

        except Exception:
            logger.error("Failed to load statistics from %s", group_path, exc_info=True)
            raise

    def has_statistics(self, dataset_name: str, grid_name: str) -> bool:
        """Return ``True`` if statistics exist for the dataset+grid pair."""
        try:
            # NOTE(review): probes existence via a full load (see has_weights).
            self.load_statistics(dataset_name, grid_name)
            return True
        except Exception:
            return False

    # ------------------------------------------------------------------
    # Utilities
    # ------------------------------------------------------------------

    def list_available_metadata(
        self, dataset_name: str, grid_name: str
    ) -> dict[str, bool]:
        """Check which metadata categories are stored.

        Returns
        -------
        dict
            ``{category: bool}`` for weights, filter_masks, spatial_masks,
            statistics.

        """
        return {
            "weights": self.has_weights(dataset_name, grid_name),
            "filter_masks": self.has_filter_masks(dataset_name, grid_name),
            "spatial_masks": self.has_spatial_masks(dataset_name, grid_name),
            "statistics": self.has_statistics(dataset_name, grid_name),
        }

    def get_metadata_summary(self, dataset_name: str, grid_name: str) -> dict[str, Any]:
        """Detailed summary of all stored metadata for a dataset+grid pair.

        Returns
        -------
        dict
            Nested summary with availability flags and per-category details.

        """
        summary: dict[str, Any] = {
            "dataset": dataset_name,
            "grid": grid_name,
            "available": self.list_available_metadata(dataset_name, grid_name),
        }

        # NOTE(review): each category present is loaded a second time here
        # (the has_* probes already loaded it once) — acceptable for a summary
        # call, but worth knowing for large stores.
        if summary["available"]["weights"]:
            weights = self.load_weights(dataset_name, grid_name)
            summary["weights"] = {
                "types": list(weights.keys()),
                "ncells": len(next(iter(weights.values()))),
            }

        if summary["available"]["filter_masks"]:
            masks = self.load_all_filter_masks(dataset_name, grid_name)
            summary["filter_masks"] = {
                "types": list(masks.keys()),
                "shape": next(iter(masks.values())).shape,
            }

        if summary["available"]["spatial_masks"]:
            masks = self.load_all_spatial_masks(dataset_name, grid_name)
            summary["spatial_masks"] = {
                "types": list(masks.keys()),
                "ncells": len(next(iter(masks.values()))),
            }

        if summary["available"]["statistics"]:
            stats = self.load_statistics(dataset_name, grid_name)
            summary["statistics"] = {
                "types": list(stats.keys()),
                "ncells": len(next(iter(stats.values()))),
            }

        return summary

    # ------------------------------------------------------------------
    # Deletion
    # ------------------------------------------------------------------

    def _delete_group(self, group_path: str, label: str) -> str:
        """Generic group deletion helper.

        Parameters
        ----------
        group_path : str
            Zarr group path inside the store.
        label : str
            Human-readable label for log / commit messages.

        Returns
        -------
        str
            Snapshot ID.

        Raises
        ------
        ValueError
            If the group does not exist.

        """
        logger.info("Deleting %s at %s", label, group_path)

        with self.store.writable_session() as session:
            import zarr

            # Open the store root in read/write mode so the group can be removed.
            store = zarr.open(session.store, mode="r+")
            if group_path in store:
                # Removes the group and everything stored beneath it.
                del store[group_path]
                snapshot_id: str = session.commit(f"Deleted {label}")
                logger.info("%s deleted (snapshot: %s)", label, snapshot_id[:8])
                return snapshot_id
            raise ValueError(f"Group {group_path} does not exist")

    def delete_weights(self, dataset_name: str, grid_name: str) -> str:
        """Delete all weights for a dataset+grid pair."""
        return self._delete_group(
            f"metadata/{dataset_name}/{grid_name}/weights",
            f"weights for {dataset_name}/{grid_name}",
        )

    def delete_filter_masks(self, dataset_name: str, grid_name: str) -> str:
        """Delete all filter masks for a dataset+grid pair."""
        return self._delete_group(
            f"metadata/{dataset_name}/{grid_name}/filter_masks",
            f"filter masks for {dataset_name}/{grid_name}",
        )

    def delete_spatial_masks(self, dataset_name: str, grid_name: str) -> str:
        """Delete all spatial masks for a dataset+grid pair."""
        return self._delete_group(
            f"metadata/{dataset_name}/{grid_name}/spatial_masks",
            f"spatial masks for {dataset_name}/{grid_name}",
        )

    def delete_statistics(self, dataset_name: str, grid_name: str) -> str:
        """Delete all statistics for a dataset+grid pair."""
        return self._delete_group(
            f"metadata/{dataset_name}/{grid_name}/statistics",
            f"statistics for {dataset_name}/{grid_name}",
        )

    def delete_all_metadata(self, dataset_name: str, grid_name: str) -> str:
        """Delete the entire metadata subtree for a dataset+grid pair."""
        return self._delete_group(
            f"metadata/{dataset_name}/{grid_name}",
            f"all metadata for {dataset_name}/{grid_name}",
        )

    def delete_specific_weight(
        self, dataset_name: str, grid_name: str, weight_name: str
    ) -> str:
        """Delete a single weight variable from an existing weights group.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        weight_name : str
            Weight variable name (e.g. ``'observation_count'``).

        Returns
        -------
        str
            Snapshot ID.

        Raises
        ------
        ValueError
            If the weight variable does not exist in the group.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/weights"
        logger.info("Deleting weight '%s' from %s", weight_name, group_path)

        with self.store.writable_session() as session:
            import zarr

            group = zarr.open(session.store, path=group_path, mode="r+")
            if weight_name in group:
                # NOTE(review): the group attr "weight_types" (written by
                # store_weights) is not updated here and will go stale.
                del group[weight_name]
                snapshot_id: str = session.commit(
                    f"Deleted weight '{weight_name}' for {dataset_name}/{grid_name}"
                )
                logger.info(
                    "Weight '%s' deleted (snapshot: %s)",
                    weight_name,
                    snapshot_id[:8],
                )
                return snapshot_id
            raise ValueError(f"Weight '{weight_name}' does not exist in {group_path}")

    def delete_specific_filter_mask(
        self, dataset_name: str, grid_name: str, filter_type: str
    ) -> str:
        """Delete a single filter mask variable from an existing filter_masks group.

        Parameters
        ----------
        dataset_name : str
            Dataset identifier.
        grid_name : str
            Grid identifier.
        filter_type : str
            Filter type name (e.g. ``'iqr'``).

        Returns
        -------
        str
            Snapshot ID.

        Raises
        ------
        ValueError
            If the filter mask does not exist in the group.

        """
        group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"
        # Stored variables carry a "mask_" prefix (see store_filter_masks).
        mask_var = f"mask_{filter_type}"
        logger.info("Deleting filter mask '%s' from %s", filter_type, group_path)

        with self.store.writable_session() as session:
            import zarr

            group = zarr.open(session.store, path=group_path, mode="r+")
            if mask_var in group:
                # NOTE(review): the group attr "filter_types" (written by
                # store_filter_masks) is not updated here and will go stale.
                del group[mask_var]
                commit_msg = (
                    f"Deleted filter mask '{filter_type}' for "
                    f"{dataset_name}/{grid_name}"
                )
                snapshot_id: str = session.commit(commit_msg)
                logger.info(
                    "Filter mask '%s' deleted (snapshot: %s)",
                    filter_type,
                    snapshot_id[:8],
                )
                return snapshot_id
            raise ValueError(
                f"Filter mask '{filter_type}' does not exist in {group_path}"
            )

__init__(store_path)

Initialize the storage manager.

Parameters

store_path : Path | str
    Path to the VOD Icechunk store directory.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
77
78
79
80
81
82
83
84
85
86
87
def __init__(self, store_path: Path | str) -> None:
    """Initialize the storage manager.

    Parameters
    ----------
    store_path : Path | str
        Path to the VOD Icechunk store directory.

    """
    self.store_path = Path(store_path)
    self.store: MyIcechunkStore = _get_store(self.store_path)

__repr__()

Return the developer-facing representation.

Returns

str Representation string.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
89
90
91
92
93
94
95
96
97
98
def __repr__(self) -> str:
    """Return the developer-facing representation.

    Returns
    -------
    str
        Representation string.

    """
    return f"AnalysisStorage(store_path={self.store_path})"

store_weights(dataset_name, grid_name, weights, weight_params=None, overwrite=False)

Store per-cell weight arrays.

Parameters

dataset_name : str
    Dataset identifier (e.g. 'reference_01_canopy_01').
grid_name : str
    Grid identifier (e.g. 'equal_area_2deg').
weights : dict
    {name: array} – all arrays must have shape (ncells,).
weight_params : dict, optional
    Parameters used to compute each weight type.
overwrite : bool
    Overwrite existing weights.

Returns

str Icechunk snapshot ID.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def store_weights(
    self,
    dataset_name: str,
    grid_name: str,
    weights: dict[str, np.ndarray],
    weight_params: dict[str, dict] | None = None,
    overwrite: bool = False,
) -> str:
    """Store per-cell weight arrays.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier (e.g. ``'reference_01_canopy_01'``).
    grid_name : str
        Grid identifier (e.g. ``'equal_area_2deg'``).
    weights : dict
        ``{name: array}`` – all arrays must have shape ``(ncells,)``.
    weight_params : dict, optional
        Parameters used to compute each weight type.
    overwrite : bool
        Overwrite existing weights.

    Returns
    -------
    str
        Icechunk snapshot ID.

    Raises
    ------
    ValueError
        If ``weights`` is empty or any array does not have shape
        ``(ncells,)``.

    """
    # Guard first: an empty dict would otherwise surface as a confusing
    # StopIteration from next(iter(...)) below.
    if not weights:
        raise ValueError("weights must contain at least one array")

    # Validate shapes before any logging or I/O: every array must be 1-D
    # with the same cell count as the first one.
    ncells = len(next(iter(weights.values())))
    for name, arr in weights.items():
        if arr.shape != (ncells,):
            raise ValueError(
                f"Weight '{name}' has shape {arr.shape}, expected ({ncells},)"
            )

    group_path = f"metadata/{dataset_name}/{grid_name}/weights"
    logger.info("Storing weights to %s", group_path)

    # Build xarray dataset (float32 keeps on-disk storage compact).
    weight_vars = {
        name: (["cell"], arr.astype(np.float32)) for name, arr in weights.items()
    }
    ds_weights = xr.Dataset(
        weight_vars, coords={"cell": np.arange(ncells, dtype=np.int32)}
    )

    # NOTE(review): timestamp is naive local time — consider UTC; kept
    # as-is for consistency with the other store_* methods.
    attrs: dict[str, Any] = {
        "created_at": datetime.now().isoformat(),
        "dataset": dataset_name,
        "grid": grid_name,
        "weight_types": list(weights.keys()),
        "ncells": ncells,
    }
    if weight_params:
        attrs["weight_parameters"] = str(weight_params)
    ds_weights.attrs.update(attrs)

    # Persist: mode "w-" refuses to clobber an existing group unless the
    # caller explicitly asked to overwrite.
    with self.store.writable_session() as session:
        from icechunk.xarray import to_icechunk

        mode = "w" if overwrite else "w-"
        to_icechunk(ds_weights, session, group=group_path, mode=mode)
        snapshot_id: str = session.commit(
            f"Stored weights for {dataset_name}/{grid_name}"
        )

    logger.info("Weights stored (snapshot: %s)", snapshot_id[:8])
    return snapshot_id

load_weights(dataset_name, grid_name, weight_type=None)

Load stored weight arrays.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. weight_type : str, optional Load only this weight. None loads all.

Returns

dict {name: ndarray} of loaded weights.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
def load_weights(
    self,
    dataset_name: str,
    grid_name: str,
    weight_type: str | None = None,
) -> dict[str, np.ndarray]:
    """Fetch stored weight arrays from the Icechunk store.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    weight_type : str, optional
        Name of one weight to load; ``None`` loads every weight.

    Returns
    -------
    dict
        Mapping of weight name to array.

    Raises
    ------
    ValueError
        If ``weight_type`` is given but not present in the store.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/weights"

    try:
        # Open the weights group read-only; .values materializes arrays.
        with self.store.readonly_session() as session:
            ds_weights = xr.open_zarr(
                session.store, group=group_path, consolidated=False
            )

        if not weight_type:
            # No specific weight requested: return every stored variable.
            return {var: ds_weights[var].values for var in ds_weights.data_vars}

        if weight_type not in ds_weights:
            raise ValueError(
                f"Weight '{weight_type}' not found. "
                f"Available: {list(ds_weights.data_vars)}"
            )
        return {weight_type: ds_weights[weight_type].values}

    except Exception:
        logger.error("Failed to load weights from %s", group_path, exc_info=True)
        raise

has_weights(dataset_name, grid_name)

Return True if weights exist for the dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
220
221
222
223
224
225
226
def has_weights(self, dataset_name: str, grid_name: str) -> bool:
    """Return ``True`` if weights exist for the dataset+grid pair."""
    try:
        self.load_weights(dataset_name, grid_name)
    except Exception:
        # Any failure (missing group, unreadable store) means "not present".
        return False
    return True

store_filter_masks(dataset_name, grid_name, masks, filter_params=None, overwrite=False)

Store per-observation filter masks at native resolution.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. masks : dict {filter_name: DataArray} – all must share the same (epoch, sid) shape. filter_params : dict, optional Parameters used for each filter. overwrite : bool Overwrite existing masks.

Returns

str Icechunk snapshot ID.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
def store_filter_masks(
    self,
    dataset_name: str,
    grid_name: str,
    masks: dict[str, xr.DataArray],
    filter_params: dict[str, dict] | None = None,
    overwrite: bool = False,
) -> str:
    """Store per-observation filter masks at native resolution.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    masks : dict
        ``{filter_name: DataArray}`` – all must share the same
        ``(epoch, sid)`` shape.
    filter_params : dict, optional
        Parameters used for each filter.
    overwrite : bool
        Overwrite existing masks.

    Returns
    -------
    str
        Icechunk snapshot ID.

    Raises
    ------
    ValueError
        If ``masks`` is empty or the masks do not share one shape.
    TypeError
        If any mask is not an ``xr.DataArray``.

    """
    # Guard first: an empty dict would otherwise surface as a confusing
    # StopIteration from next(iter(...)) below.
    if not masks:
        raise ValueError("masks must contain at least one DataArray")

    group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"
    logger.info("Storing filter masks to %s", group_path)

    # All masks must agree with the shape of the first one.
    first_mask = next(iter(masks.values()))
    shape = first_mask.shape

    mask_vars: dict[str, xr.DataArray] = {}
    for name, mask_array in masks.items():
        if not isinstance(mask_array, xr.DataArray):
            raise TypeError(f"Mask '{name}' must be xr.DataArray")
        if mask_array.shape != shape:
            raise ValueError("All masks must have same shape")
        # int8 is the most compact widely-supported on-disk encoding.
        mask_vars[f"mask_{name}"] = mask_array.astype(np.int8)

    ds_masks = xr.Dataset(mask_vars)

    # coordinate_source lets load_filter_mask re-attach epoch/sid coords.
    attrs: dict[str, Any] = {
        "created_at": datetime.now().isoformat(),
        "dataset": dataset_name,
        "grid": grid_name,
        "filter_types": list(masks.keys()),
        "shape": str(shape),
        "coordinate_source": f"/{dataset_name}/",
    }
    if filter_params:
        attrs["filter_parameters"] = str(filter_params)
    ds_masks.attrs.update(attrs)

    # Rechunk for efficient columnar storage; assumes (epoch, sid) dims.
    logger.info("Rechunking masks for efficient storage")
    ds_masks = ds_masks.chunk({"epoch": 10000, "sid": -1})

    with self.store.writable_session() as session:
        import dask
        from icechunk.xarray import to_icechunk

        logger.info("Writing masks (this may take a few minutes)")
        # Bounded thread pool keeps memory pressure predictable.
        with dask.config.set(scheduler="threads", num_workers=4):
            mode = "w" if overwrite else "w-"
            to_icechunk(ds_masks, session, group=group_path, mode=mode)

        logger.info("Committing")
        snapshot_id = session.commit(
            f"Stored filter masks for {dataset_name}/{grid_name}"
        )

    logger.info("Filter masks stored (snapshot: %s)", snapshot_id[:8])
    return snapshot_id

load_filter_mask(dataset_name, grid_name, filter_type, attach_coords=True)

Load a single filter mask.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. filter_type : str Filter name (e.g. 'iqr', 'zscore'). attach_coords : bool Re-attach epoch / sid coordinates from the source dataset group. Set False for faster loading when coordinates are not needed.

Returns

xr.DataArray Boolean mask with shape (epoch, sid).

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
def load_filter_mask(
    self,
    dataset_name: str,
    grid_name: str,
    filter_type: str,
    attach_coords: bool = True,
) -> xr.DataArray:
    """Load a single filter mask.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    filter_type : str
        Filter name (e.g. ``'iqr'``, ``'zscore'``).
    attach_coords : bool
        Re-attach ``epoch`` / ``sid`` coordinates from the source
        dataset group.  Set ``False`` for faster loading when
        coordinates are not needed.

    Returns
    -------
    xr.DataArray
        Boolean mask with shape ``(epoch, sid)``.

    Raises
    ------
    ValueError
        If no mask named ``filter_type`` is stored for this pair.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"

    try:
        # First session: open the masks group read-only.
        with self.store.readonly_session() as session:
            ds_masks = xr.open_zarr(
                session.store, group=group_path, consolidated=False
            )

        # Variables are stored with a "mask_" prefix (see store_filter_masks).
        mask_var = f"mask_{filter_type}"
        if mask_var not in ds_masks:
            available = [v.replace("mask_", "") for v in ds_masks.data_vars]
            raise ValueError(
                f"Filter mask '{filter_type}' not found. Available: {available}"
            )

        # Masks are persisted as int8; convert back to boolean here.
        mask = ds_masks[mask_var].astype(bool)

        if attach_coords:
            # The store_* side records where coordinates live; fall back to
            # the dataset's own group when the attribute is absent.
            coord_source = ds_masks.attrs.get(
                "coordinate_source", f"/{dataset_name}/"
            )
            # Second session: open the coordinate source group.
            with self.store.readonly_session() as session:
                ds_source = xr.open_zarr(
                    session.store,
                    group=coord_source.strip("/"),
                    consolidated=False,
                )

            # Primary dimension coordinates first ...
            mask = mask.assign_coords(
                {"epoch": ds_source["epoch"], "sid": ds_source["sid"]}
            )
            # ... then any auxiliary signal coordinates that exist.
            for coord in [
                "band",
                "code",
                "sv",
                "system",
                "freq_min",
                "freq_max",
                "freq_center",
            ]:
                if coord in ds_source.coords:
                    mask = mask.assign_coords({coord: ds_source[coord]})

        return mask

    except Exception:
        logger.error("Failed to load filter mask", exc_info=True)
        raise

load_all_filter_masks(dataset_name, grid_name)

Load all stored filter masks.

Returns

dict {filter_name: DataArray} – boolean masks.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
def load_all_filter_masks(
    self, dataset_name: str, grid_name: str
) -> dict[str, xr.DataArray]:
    """Load every stored filter mask for a dataset+grid pair.

    Returns
    -------
    dict
        ``{filter_name: DataArray}`` – boolean masks.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"

    try:
        with self.store.readonly_session() as session:
            ds_masks = xr.open_zarr(
                session.store, group=group_path, consolidated=False
            )
        # Strip the storage-side "mask_" prefix and restore boolean dtype.
        result: dict[str, xr.DataArray] = {}
        for var in ds_masks.data_vars:
            result[var.replace("mask_", "")] = ds_masks[var].astype(bool)
        return result
    except Exception:
        logger.error(
            "Failed to load filter masks from %s", group_path, exc_info=True
        )
        raise

has_filter_masks(dataset_name, grid_name)

Return True if filter masks exist for the dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
416
417
418
419
420
421
422
def has_filter_masks(self, dataset_name: str, grid_name: str) -> bool:
    """Return ``True`` if filter masks exist for the dataset+grid pair."""
    try:
        self.load_all_filter_masks(dataset_name, grid_name)
    except Exception:
        # A failed load of any kind is treated as "not present".
        return False
    return True

store_spatial_masks(dataset_name, grid_name, masks, mask_descriptions=None, overwrite=False)

Store per-cell geometric selection masks.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. masks : dict {name: bool_array} – all arrays must have shape (ncells,) and boolean dtype. mask_descriptions : dict, optional Human-readable description for each mask (stored as variable attrs). overwrite : bool Overwrite existing masks.

Returns

str Icechunk snapshot ID.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
def store_spatial_masks(
    self,
    dataset_name: str,
    grid_name: str,
    masks: dict[str, np.ndarray],
    mask_descriptions: dict[str, str] | None = None,
    overwrite: bool = False,
) -> str:
    """Store per-cell geometric selection masks.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    masks : dict
        ``{name: bool_array}`` – all arrays must have shape ``(ncells,)``
        and boolean dtype.
    mask_descriptions : dict, optional
        Human-readable description for each mask (stored as variable attrs).
    overwrite : bool
        Overwrite existing masks.

    Returns
    -------
    str
        Icechunk snapshot ID.

    Raises
    ------
    ValueError
        If ``masks`` is empty, any array is not ``(ncells,)``, or any
        array is not boolean.

    """
    # Guard first: an empty dict would otherwise surface as a confusing
    # StopIteration from next(iter(...)) below.
    if not masks:
        raise ValueError("masks must contain at least one array")

    # Validate before any logging or I/O.
    ncells = len(next(iter(masks.values())))
    for name, arr in masks.items():
        if arr.shape != (ncells,):
            raise ValueError(
                f"Mask '{name}' has shape {arr.shape}, expected ({ncells},)"
            )
        if arr.dtype != bool:
            raise ValueError(
                f"Mask '{name}' must be boolean dtype, got {arr.dtype}"
            )

    group_path = f"metadata/{dataset_name}/{grid_name}/spatial_masks"
    logger.info("Storing spatial masks to %s", group_path)

    # Persist as int8 (compact); loaders convert back to bool.
    mask_vars = {
        f"mask_{name}": (["cell"], arr.astype(np.int8))
        for name, arr in masks.items()
    }
    ds_masks = xr.Dataset(
        mask_vars, coords={"cell": np.arange(ncells, dtype=np.int32)}
    )

    # Per-variable metadata: selection counts and optional descriptions.
    for name, arr in masks.items():
        n_sel = int(arr.sum())
        ds_masks[f"mask_{name}"].attrs["n_cells_selected"] = n_sel
        ds_masks[f"mask_{name}"].attrs["fraction_selected"] = float(n_sel / ncells)
    if mask_descriptions:
        for name, desc in mask_descriptions.items():
            if name in masks:
                ds_masks[f"mask_{name}"].attrs["description"] = desc

    ds_masks.attrs.update(
        {
            "created_at": datetime.now().isoformat(),
            "dataset": dataset_name,
            "grid": grid_name,
            "mask_types": list(masks.keys()),
            "ncells": ncells,
        }
    )

    # Persist: mode "w-" refuses to clobber unless overwrite is requested.
    with self.store.writable_session() as session:
        from icechunk.xarray import to_icechunk

        mode = "w" if overwrite else "w-"
        to_icechunk(ds_masks, session, group=group_path, mode=mode)
        snapshot_id = session.commit(
            f"Stored spatial masks for {dataset_name}/{grid_name}"
        )

    logger.info("Spatial masks stored (snapshot: %s)", snapshot_id[:8])
    return snapshot_id

load_spatial_mask(dataset_name, grid_name, mask_name)

Load a single spatial mask.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. mask_name : str Mask name (e.g. 'north', 'high_elevation').

Returns

np.ndarray Boolean array with shape (ncells,).

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
def load_spatial_mask(
    self, dataset_name: str, grid_name: str, mask_name: str
) -> np.ndarray:
    """Load one geometric selection mask as a boolean array.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    mask_name : str
        Mask name (e.g. ``'north'``, ``'high_elevation'``).

    Returns
    -------
    np.ndarray
        Boolean array with shape ``(ncells,)``.

    Raises
    ------
    ValueError
        If no mask named ``mask_name`` is stored.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/spatial_masks"
    mask_var = f"mask_{mask_name}"

    try:
        with self.store.readonly_session() as session:
            ds_masks = xr.open_zarr(
                session.store, group=group_path, consolidated=False
            )

        if mask_var in ds_masks:
            # Stored as int8; restore boolean dtype on the way out.
            return ds_masks[mask_var].values.astype(bool)

        available = [v.replace("mask_", "") for v in ds_masks.data_vars]
        raise ValueError(
            f"Spatial mask '{mask_name}' not found. Available: {available}"
        )
    except Exception:
        logger.error(
            "Failed to load spatial mask from %s", group_path, exc_info=True
        )
        raise

load_all_spatial_masks(dataset_name, grid_name)

Load all spatial masks.

Returns

dict {name: bool_ndarray}.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
def load_all_spatial_masks(
    self, dataset_name: str, grid_name: str
) -> dict[str, np.ndarray]:
    """Load every stored spatial mask for a dataset+grid pair.

    Returns
    -------
    dict
        ``{name: bool_ndarray}``.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/spatial_masks"

    try:
        with self.store.readonly_session() as session:
            ds_masks = xr.open_zarr(
                session.store, group=group_path, consolidated=False
            )
        # Strip the "mask_" storage prefix and restore boolean dtype.
        result: dict[str, np.ndarray] = {}
        for var in ds_masks.data_vars:
            result[var.replace("mask_", "")] = ds_masks[var].values.astype(bool)
        return result
    except Exception:
        logger.error(
            "Failed to load spatial masks from %s", group_path, exc_info=True
        )
        raise

has_spatial_masks(dataset_name, grid_name)

Return True if spatial masks exist for the dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
582
583
584
585
586
587
588
def has_spatial_masks(self, dataset_name: str, grid_name: str) -> bool:
    """Return ``True`` if spatial masks exist for the dataset+grid pair."""
    try:
        self.load_all_spatial_masks(dataset_name, grid_name)
    except Exception:
        # Treat any load failure as "not present".
        return False
    return True

store_statistics(dataset_name, grid_name, stats, overwrite=False)

Store pre-computed per-cell statistics.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. stats : dict {name: array} – all arrays must have shape (ncells,). Variables whose name ends with '_count' or equals 'obs_count' are stored as int64; everything else as float32. overwrite : bool Overwrite existing statistics.

Returns

str Icechunk snapshot ID.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
def store_statistics(
    self,
    dataset_name: str,
    grid_name: str,
    stats: dict[str, np.ndarray],
    overwrite: bool = False,
) -> str:
    """Store pre-computed per-cell statistics.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    stats : dict
        ``{name: array}`` – all arrays must have shape ``(ncells,)``.
        Variables whose name ends with ``'_count'`` or equals
        ``'obs_count'`` are stored as ``int64``; everything else as
        ``float32``.
    overwrite : bool
        Overwrite existing statistics.

    Returns
    -------
    str
        Icechunk snapshot ID.

    Raises
    ------
    ValueError
        If ``stats`` is empty or any array is not ``(ncells,)``.

    """
    # Guard first: an empty dict would otherwise surface as a confusing
    # StopIteration from next(iter(...)) below.
    if not stats:
        raise ValueError("stats must contain at least one array")

    # Validate before any logging or I/O.
    ncells = len(next(iter(stats.values())))
    for name, arr in stats.items():
        if arr.shape != (ncells,):
            raise ValueError(
                f"Statistic '{name}' has shape {arr.shape}, expected ({ncells},)"
            )

    group_path = f"metadata/{dataset_name}/{grid_name}/statistics"
    logger.info("Storing statistics to %s", group_path)

    # Counts keep integer precision; everything else is compact float32.
    stat_vars = {}
    for name, arr in stats.items():
        dtype = (
            np.int64
            if (name.endswith("_count") or name == "obs_count")
            else np.float32
        )
        stat_vars[name] = (["cell"], arr.astype(dtype))

    ds_stats = xr.Dataset(
        stat_vars, coords={"cell": np.arange(ncells, dtype=np.int32)}
    )

    # Summary statistics per variable (stored as variable attrs); skipped
    # entirely when a variable has no finite values.
    for name, arr in stats.items():
        valid = np.isfinite(arr)
        if np.any(valid):
            ds_stats[name].attrs.update(
                {
                    "min": float(np.nanmin(arr)),
                    "max": float(np.nanmax(arr)),
                    "mean": float(np.nanmean(arr)),
                    "n_valid": int(np.sum(valid)),
                }
            )

    ds_stats.attrs.update(
        {
            "created_at": datetime.now().isoformat(),
            "dataset": dataset_name,
            "grid": grid_name,
            "statistics": list(stats.keys()),
            "ncells": ncells,
        }
    )

    # Persist: mode "w-" refuses to clobber unless overwrite is requested.
    with self.store.writable_session() as session:
        from icechunk.xarray import to_icechunk

        mode = "w" if overwrite else "w-"
        to_icechunk(ds_stats, session, group=group_path, mode=mode)
        snapshot_id = session.commit(
            f"Stored statistics for {dataset_name}/{grid_name}"
        )

    logger.info("Statistics stored (snapshot: %s)", snapshot_id[:8])
    return snapshot_id

load_statistics(dataset_name, grid_name, stat_name=None)

Load pre-computed per-cell statistics.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. stat_name : str, optional Load only this statistic. None loads all.

Returns

dict {name: ndarray}.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
def load_statistics(
    self,
    dataset_name: str,
    grid_name: str,
    stat_name: str | None = None,
) -> dict[str, np.ndarray]:
    """Load pre-computed per-cell statistics.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    stat_name : str, optional
        Name of a single statistic to load; ``None`` loads all.

    Returns
    -------
    dict
        ``{name: ndarray}``.

    Raises
    ------
    ValueError
        If ``stat_name`` is given but not stored.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/statistics"

    try:
        with self.store.readonly_session() as session:
            ds_stats = xr.open_zarr(
                session.store, group=group_path, consolidated=False
            )

        if not stat_name:
            # No filter requested: materialize every stored statistic.
            return {var: ds_stats[var].values for var in ds_stats.data_vars}

        if stat_name not in ds_stats:
            raise ValueError(
                f"Statistic '{stat_name}' not found. "
                f"Available: {list(ds_stats.data_vars)}"
            )
        return {stat_name: ds_stats[stat_name].values}

    except Exception:
        logger.error("Failed to load statistics from %s", group_path, exc_info=True)
        raise

has_statistics(dataset_name, grid_name)

Return True if statistics exist for the dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
725
726
727
728
729
730
731
def has_statistics(self, dataset_name: str, grid_name: str) -> bool:
    """Return ``True`` if statistics exist for the dataset+grid pair."""
    try:
        self.load_statistics(dataset_name, grid_name)
    except Exception:
        # Treat any load failure as "not present".
        return False
    return True

list_available_metadata(dataset_name, grid_name)

Check which metadata categories are stored.

Returns

dict {category: bool} for weights, filter_masks, spatial_masks, statistics.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
def list_available_metadata(
    self, dataset_name: str, grid_name: str
) -> dict[str, bool]:
    """Check which metadata categories are stored.

    Returns
    -------
    dict
        ``{category: bool}`` for weights, filter_masks, spatial_masks,
        statistics.

    """
    # Table-driven probing keeps category order and names in one place.
    probes = (
        ("weights", self.has_weights),
        ("filter_masks", self.has_filter_masks),
        ("spatial_masks", self.has_spatial_masks),
        ("statistics", self.has_statistics),
    )
    return {category: probe(dataset_name, grid_name) for category, probe in probes}

get_metadata_summary(dataset_name, grid_name)

Detailed summary of all stored metadata for a dataset+grid pair.

Returns

dict Nested summary with availability flags and per-category details.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
def get_metadata_summary(self, dataset_name: str, grid_name: str) -> dict[str, Any]:
    """Detailed summary of all stored metadata for a dataset+grid pair.

    Returns
    -------
    dict
        Nested summary with availability flags and per-category details.

    """
    available = self.list_available_metadata(dataset_name, grid_name)
    summary: dict[str, Any] = {
        "dataset": dataset_name,
        "grid": grid_name,
        "available": available,
    }

    # Only load categories that actually exist; each entry records the
    # stored variable names plus a size indicator.
    if available["weights"]:
        weights = self.load_weights(dataset_name, grid_name)
        summary["weights"] = {
            "types": list(weights),
            "ncells": len(next(iter(weights.values()))),
        }

    if available["filter_masks"]:
        fmasks = self.load_all_filter_masks(dataset_name, grid_name)
        summary["filter_masks"] = {
            "types": list(fmasks),
            "shape": next(iter(fmasks.values())).shape,
        }

    if available["spatial_masks"]:
        smasks = self.load_all_spatial_masks(dataset_name, grid_name)
        summary["spatial_masks"] = {
            "types": list(smasks),
            "ncells": len(next(iter(smasks.values()))),
        }

    if available["statistics"]:
        stats = self.load_statistics(dataset_name, grid_name)
        summary["statistics"] = {
            "types": list(stats),
            "ncells": len(next(iter(stats.values()))),
        }

    return summary

delete_weights(dataset_name, grid_name)

Delete all weights for a dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
839
840
841
842
843
844
def delete_weights(self, dataset_name: str, grid_name: str) -> str:
    """Delete all weights for a dataset+grid pair."""
    target = f"metadata/{dataset_name}/{grid_name}/weights"
    label = f"weights for {dataset_name}/{grid_name}"
    return self._delete_group(target, label)

delete_filter_masks(dataset_name, grid_name)

Delete all filter masks for a dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
846
847
848
849
850
851
def delete_filter_masks(self, dataset_name: str, grid_name: str) -> str:
    """Delete all filter masks for a dataset+grid pair."""
    target = f"metadata/{dataset_name}/{grid_name}/filter_masks"
    label = f"filter masks for {dataset_name}/{grid_name}"
    return self._delete_group(target, label)

delete_spatial_masks(dataset_name, grid_name)

Delete all spatial masks for a dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
853
854
855
856
857
858
def delete_spatial_masks(self, dataset_name: str, grid_name: str) -> str:
    """Delete all spatial masks for a dataset+grid pair."""
    target = f"metadata/{dataset_name}/{grid_name}/spatial_masks"
    label = f"spatial masks for {dataset_name}/{grid_name}"
    return self._delete_group(target, label)

delete_statistics(dataset_name, grid_name)

Delete all statistics for a dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
860
861
862
863
864
865
def delete_statistics(self, dataset_name: str, grid_name: str) -> str:
    """Delete all statistics for a dataset+grid pair."""
    target = f"metadata/{dataset_name}/{grid_name}/statistics"
    label = f"statistics for {dataset_name}/{grid_name}"
    return self._delete_group(target, label)

delete_all_metadata(dataset_name, grid_name)

Delete the entire metadata subtree for a dataset+grid pair.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
867
868
869
870
871
872
def delete_all_metadata(self, dataset_name: str, grid_name: str) -> str:
    """Delete the entire metadata subtree for a dataset+grid pair."""
    target = f"metadata/{dataset_name}/{grid_name}"
    label = f"all metadata for {dataset_name}/{grid_name}"
    return self._delete_group(target, label)

delete_specific_weight(dataset_name, grid_name, weight_name)

Delete a single weight variable from an existing weights group.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. weight_name : str Weight variable name (e.g. 'observation_count').

Returns

str Snapshot ID.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
def delete_specific_weight(
    self, dataset_name: str, grid_name: str, weight_name: str
) -> str:
    """Remove one weight variable from the weights group of a dataset+grid.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    weight_name : str
        Weight variable name (e.g. ``'observation_count'``).

    Returns
    -------
    str
        Snapshot ID.

    Raises
    ------
    ValueError
        If the named weight is not present in the group.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/weights"
    logger.info("Deleting weight '%s' from %s", weight_name, group_path)

    with self.store.writable_session() as session:
        import zarr

        group = zarr.open(session.store, path=group_path, mode="r+")
        # Guard clause: fail fast when the variable is missing.
        if weight_name not in group:
            raise ValueError(f"Weight '{weight_name}' does not exist in {group_path}")

        del group[weight_name]
        snapshot_id = session.commit(
            f"Deleted weight '{weight_name}' for {dataset_name}/{grid_name}"
        )
        logger.info(
            "Weight '%s' deleted (snapshot: %s)",
            weight_name,
            snapshot_id[:8],
        )
        return snapshot_id

delete_specific_filter_mask(dataset_name, grid_name, filter_type)

Delete a single filter mask variable from an existing filter_masks group.

Parameters

dataset_name : str Dataset identifier. grid_name : str Grid identifier. filter_type : str Filter type name (e.g. 'iqr').

Returns

str Snapshot ID.

Source code in packages/canvod-grids/src/canvod/grids/analysis/analysis_storage.py
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
def delete_specific_filter_mask(
    self, dataset_name: str, grid_name: str, filter_type: str
) -> str:
    """Remove one filter-mask variable from the filter_masks group.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier.
    grid_name : str
        Grid identifier.
    filter_type : str
        Filter type name (e.g. ``'iqr'``).

    Returns
    -------
    str
        Snapshot ID.

    Raises
    ------
    ValueError
        If no mask for *filter_type* exists in the group.

    """
    group_path = f"metadata/{dataset_name}/{grid_name}/filter_masks"
    # Masks are stored under a 'mask_' prefix.
    mask_var = f"mask_{filter_type}"
    logger.info("Deleting filter mask '%s' from %s", filter_type, group_path)

    with self.store.writable_session() as session:
        import zarr

        group = zarr.open(session.store, path=group_path, mode="r+")
        # Guard clause: fail fast when the mask variable is missing.
        if mask_var not in group:
            raise ValueError(
                f"Filter mask '{filter_type}' does not exist in {group_path}"
            )

        del group[mask_var]
        commit_msg = (
            f"Deleted filter mask '{filter_type}' for "
            f"{dataset_name}/{grid_name}"
        )
        snapshot_id = session.commit(commit_msg)
        logger.info(
            "Filter mask '%s' deleted (snapshot: %s)",
            filter_type,
            snapshot_id[:8],
        )
        return snapshot_id

Workflows

Core adapted VOD workflow.

Orchestrates the full VOD analysis pipeline: loading data from an Icechunk store, applying cell-SID Hampel filtering (vectorised or parallelised), and persisting results back to a processing branch.

Classes

AdaptedVODWorkflow Main workflow entry-point. Lazily imports canvod-store so the package can be imported even when the store is not installed.

Functions

get_workflow_for_store Convenience factory. check_processed_data_status Introspect the store for previously filtered data.

AdaptedVODWorkflow

Core VOD analysis workflow with polars-optimised loading and refined temporal matching.

All heavy lifting (filtering, grid operations) is delegated to canvod.grids.analysis. This class is responsible only for Icechunk I/O and orchestration.

Parameters

vod_store_path : Path or str Path to the VOD Icechunk store directory.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
class AdaptedVODWorkflow:
    """Core VOD analysis workflow with polars-optimised loading and refined
    temporal matching.

    All heavy lifting (filtering, grid operations) is delegated to
    ``canvod.grids.analysis``.  This class is responsible only for
    Icechunk I/O and orchestration.

    Parameters
    ----------
    vod_store_path : Path or str
        Path to the VOD Icechunk store directory.

    """

    def __init__(self, vod_store_path: Path | str) -> None:
        """Initialize the workflow.

        Parameters
        ----------
        vod_store_path : Path | str
            Path to the VOD Icechunk store directory.

        """
        self.vod_store_path = Path(vod_store_path)
        self.vod_store: MyIcechunkStore = _get_store(self.vod_store_path)

    # ------------------------------------------------------------------
    # Data loading
    # ------------------------------------------------------------------

    def load_vod_data(
        self,
        group_name: str = "reference_01_canopy_01",
        branch: str = "main",
    ) -> xr.Dataset:
        """Load a VOD dataset from the store.

        Parameters
        ----------
        group_name : str
            Zarr group path inside the store.
        branch : str
            Icechunk branch to read from.

        Returns
        -------
        xr.Dataset
            Lazy-loaded VOD dataset.

        """
        logger.info("Loading VOD data from branch=%s group=%s", branch, group_name)
        with self.vod_store.readonly_session(branch=branch) as session:
            vod_ds = xr.open_zarr(session.store, group=group_name, consolidated=False)
        logger.info("Loaded VOD dataset: %s", dict(vod_ds.sizes))
        return vod_ds

    # ------------------------------------------------------------------
    # Temporal coverage checks
    # ------------------------------------------------------------------

    def check_temporal_coverage_compatibility(
        self,
        main_ds: xr.Dataset,
        processed_ds: xr.Dataset,
        requested_time_range: tuple[datetime.date, datetime.date] | None = None,
    ) -> tuple[bool, dict[str, Any]]:
        """Check whether *processed_ds* adequately covers a time range.

        When *requested_time_range* is ``None`` the method checks that the
        processed dataset covers at least 70 % of the main dataset's span.
        When a range is given it verifies that both endpoints fall within the
        processed dataset (with a 1-day tolerance).

        Parameters
        ----------
        main_ds : xr.Dataset
            Reference (unfiltered) dataset.
        processed_ds : xr.Dataset
            Filtered dataset to validate.
        requested_time_range : tuple of date, optional
            ``(start, end)`` to check against.

        Returns
        -------
        compatible : bool
        coverage_info : dict
            Diagnostic information with ``main_range``, ``processed_range``,
            and ``requested_range``.

        """

        def _date_range(ds: xr.Dataset) -> tuple[datetime.date, datetime.date]:
            """Return the date range for a dataset.

            Parameters
            ----------
            ds : xr.Dataset
                Dataset with an epoch coordinate.

            Returns
            -------
            tuple[datetime.date, datetime.date]
                Start and end dates.

            """
            return (
                pd.to_datetime(ds.epoch.min().values).date(),
                pd.to_datetime(ds.epoch.max().values).date(),
            )

        main_start, main_end = _date_range(main_ds)
        proc_start, proc_end = _date_range(processed_ds)

        coverage_info: dict[str, Any] = {
            "main_range": (main_start, main_end),
            "processed_range": (proc_start, proc_end),
            "requested_range": requested_time_range,
        }

        if requested_time_range is None:
            main_days = (main_end - main_start).days
            proc_days = (proc_end - proc_start).days
            # Guard against a zero-length main span to avoid ZeroDivisionError.
            ratio = proc_days / main_days if main_days > 0 else 0.0
            logger.info(
                "Coverage check: main=%d days, processed=%d days, ratio=%.1f%%",
                main_days,
                proc_days,
                ratio * 100,
            )
            return ratio >= 0.7, coverage_info

        req_start, req_end = requested_time_range
        one_day = datetime.timedelta(days=1)
        # 1-day tolerance on either side of the processed span.
        start_ok = proc_start <= req_start <= proc_end + one_day
        end_ok = proc_start - one_day <= req_end <= proc_end
        compatible = start_ok and end_ok

        if not compatible:
            # FIX: the previous format string "%s%s" concatenated the two
            # dates with no separator, producing unreadable log output.
            logger.warning(
                "Temporal coverage mismatch: processed=%s to %s, "
                "requested=%s to %s",
                proc_start,
                proc_end,
                req_start,
                req_end,
            )
        return compatible, coverage_info

    # ------------------------------------------------------------------
    # Filtering entry-points
    # ------------------------------------------------------------------

    def create_processed_data_fast_hampel_complete(
        self,
        start_date: datetime.date | datetime.datetime,
        end_date: datetime.date | datetime.datetime,
        force_recreate: bool = False,
        window_hours: float = 1.0,
        sigma_threshold: float = 3.0,
        min_points: int = 5,
        ultra_fast_mode: bool = False,
        cell_batch_size: int = 200,
        n_workers: int | None = None,
    ) -> str | None:
        """Run the vectorised / ultra-fast Hampel pipeline end-to-end.

        Delegates the actual filtering to
        :func:`canvod.grids.analysis.sigma_clip_filter.astropy_hampel_vectorized_fast`
        (or its ultra-fast variant) and persists the result on a
        ``processing`` branch.

        Parameters
        ----------
        start_date, end_date : date or datetime
            Temporal extent to process.
        force_recreate : bool
            Overwrite existing filtered data.
        window_hours : float
            Hampel temporal window in hours.
        sigma_threshold : float
            MAD-based outlier threshold.
        min_points : int
            Minimum observations required per window.
        ultra_fast_mode : bool
            Use the pure-NumPy sigma-clip path (faster, less precise).
        cell_batch_size : int
            Number of cells per spatial batch.
        n_workers : int, optional
            Parallel workers.  ``None`` → auto-detect.

        Returns
        -------
        str or None
            Icechunk snapshot ID, or ``None`` if existing data was kept.

        """
        return _create_processed_data_fast_hampel(
            workflow_instance=self,
            start_date=start_date,
            end_date=end_date,
            force_recreate=force_recreate,
            window_hours=window_hours,
            sigma_threshold=sigma_threshold,
            min_points=min_points,
            ultra_fast_mode=ultra_fast_mode,
            cell_batch_size=cell_batch_size,
            n_workers=n_workers,
        )

    def create_processed_data_hampel_parallel_complete(
        self,
        start_date: datetime.date | datetime.datetime,
        end_date: datetime.date | datetime.datetime,
        force_recreate: bool = False,
        threshold: float = 3.0,
        min_obs_per_sid: int = 20,
        spatial_batch_size: int = 500,
        n_workers: int | None = None,
        temporal_agg: str | None = None,
        agg_method: str | None = None,
    ) -> str | None:
        """Run the parallelised cell-SID Hampel pipeline end-to-end.

        Loads the complete requested time range (no temporal chunking) and
        applies
        :func:`canvod.grids.analysis.hampel_filtering.aggr_hampel_cell_sid_parallelized`
        with spatial batching.

        Parameters
        ----------
        start_date, end_date : date or datetime
            Temporal extent to process.
        force_recreate : bool
            Overwrite existing filtered data.
        threshold : float
            MAD-based outlier threshold.
        min_obs_per_sid : int
            Minimum observations per cell-SID combination.
        spatial_batch_size : int
            Cells per spatial batch.
        n_workers : int, optional
            Parallel workers.  ``None`` → auto-detect.
        temporal_agg : str, optional
            Post-filtering aggregation frequency (e.g. ``'1H'``, ``'1D'``).
        agg_method : str, optional
            Aggregation method (e.g. ``'mean'``).

        Returns
        -------
        str or None
            Icechunk snapshot ID, or ``None`` if existing data was kept.

        """
        from canvod.grids import create_hemigrid
        from canvod.grids.analysis.hampel_filtering import (
            aggr_hampel_cell_sid_parallelized,
        )
        from canvod.grids.operations import add_cell_ids_to_ds_fast

        logger.info("=" * 60)
        logger.info("PARALLEL HAMPEL — complete temporal coverage")
        # FIX: "%s%s" concatenated start/end dates without a separator.
        logger.info(
            "Range: %s to %s | threshold=%.1f | min_obs=%d | batch=%d | workers=%s",
            start_date,
            end_date,
            threshold,
            min_obs_per_sid,
            spatial_batch_size,
            n_workers or "auto",
        )

        # --- guard: existing data ---
        if not self._force_or_skip("processing", force_recreate):
            return None

        # --- load complete time range ---
        logger.info("Loading complete time range for parallel processing")
        with self.vod_store.readonly_session(branch="main") as session:
            # consolidated=False for consistency with load_vod_data and
            # _try_load_hampel (the store is written without consolidated
            # metadata).
            vod_ds = xr.open_zarr(
                session.store,
                group="reference_01_canopy_01",
                consolidated=False,
            )

        vod_ds_complete = vod_ds.sel(epoch=slice(start_date, end_date))

        # Attach grid cell IDs on the fly when the dataset was stored
        # without them.
        if "cell_id_equal_area_2deg" not in vod_ds_complete:
            grid = create_hemigrid(grid_type="equal_area", angular_resolution=2)
            vod_ds_complete = add_cell_ids_to_ds_fast(
                vod_ds_complete, grid, "equal_area_2deg", data_var="VOD"
            )

        logger.info("Dataset loaded: %s", dict(vod_ds_complete.sizes))

        # --- filter ---
        t0 = time.time()
        vod_ds_filtered = aggr_hampel_cell_sid_parallelized(
            vod_ds_complete,
            threshold=threshold,
            min_obs_per_sid=min_obs_per_sid,
            spatial_batch_size=spatial_batch_size,
            n_workers=n_workers,
            temporal_agg=temporal_agg,
            agg_method=agg_method,
        )
        logger.info("Parallel filtering completed in %.1f s", time.time() - t0)

        # --- persist ---
        snapshot_id = self._persist_filtered(
            vod_ds_filtered,
            "parallel Cell-SID Hampel",
        )
        logger.info("Parallel Hampel complete. Snapshot: %s", snapshot_id)
        return snapshot_id

    # ------------------------------------------------------------------
    # High-level orchestration
    # ------------------------------------------------------------------

    def run_complete_workflow(
        self,
        group_name: str = "reference_01_canopy_01",
        branch: str = "auto",
        time_range: tuple[datetime.date, datetime.date] | None = None,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Orchestrate a complete analysis run.

        Auto-detection logic (``branch='auto'``) looks for Hampel-filtered
        data on the ``processing`` branch first.  If found and temporally
        compatible it is used directly; otherwise raw data from ``main`` is
        returned.

        Parameters
        ----------
        group_name : str
            Zarr group for the raw VOD data.
        branch : str
            ``'auto'`` for detection, or an explicit branch name.
        time_range : tuple of date, optional
            ``(start, end)`` to select.

        Returns
        -------
        dict
            Keys: ``final_data`` (Dataset), ``source_branch``,
            ``pre_filtered`` (bool), ``filter_type``.

        """
        logger.info("=" * 60)
        logger.info("HAMPEL-FILTERED VOD ANALYSIS WORKFLOW")
        logger.info("branch=%s group=%s time_range=%s", branch, group_name, time_range)

        results: dict[str, Any] = {}

        if branch == "auto":
            hampel_ds = self._try_load_hampel()

            if hampel_ds is not None:
                # Validate temporal coverage when a range is requested
                if time_range is not None:
                    dataset_start = pd.to_datetime(hampel_ds.epoch.min().values).date()
                    dataset_end = pd.to_datetime(hampel_ds.epoch.max().values).date()

                    start_ok = normalize_datetime_for_comparison(
                        time_range[0]
                    ) >= normalize_datetime_for_comparison(dataset_start)
                    end_ok = normalize_datetime_for_comparison(
                        time_range[1]
                    ) <= normalize_datetime_for_comparison(dataset_end)

                    if start_ok and end_ok:
                        hampel_ds = hampel_ds.sel(
                            epoch=slice(time_range[0], time_range[1])
                        )
                    else:
                        # FIX: "%s%s" concatenated the date pairs without a
                        # separator in the log output.
                        logger.warning(
                            "Hampel data (%s to %s) does not cover requested "
                            "range (%s to %s); falling back to main branch",
                            dataset_start,
                            dataset_end,
                            time_range[0],
                            time_range[1],
                        )
                        hampel_ds = None

                if hampel_ds is not None:
                    logger.info("Using Hampel filtered data from processing branch")
                    return {
                        "final_data": hampel_ds,
                        "source_branch": "processing",
                        "pre_filtered": True,
                        "filter_type": "hampel",
                    }

            logger.info("No usable Hampel data found; using raw data from main branch")
            branch = "main"

        # --- main branch (raw) ---
        logger.info("Loading raw data from branch=%s", branch)
        vod_ds = self.load_vod_data(group_name, branch)

        if time_range is not None:
            vod_ds = vod_ds.sel(epoch=slice(time_range[0], time_range[1]))

        results = {
            "final_data": vod_ds,
            "source_branch": branch,
            "pre_filtered": False,
            "filter_type": "none",
        }
        logger.info("Workflow complete — source=%s", branch)
        return results

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _try_load_hampel(self) -> xr.Dataset | None:
        """Attempt to load Hampel-filtered data from the processing branch.

        Returns ``None`` (and logs at debug level) when the branch or group
        does not exist; any open failure is treated as "no data".
        """
        try:
            with self.vod_store.readonly_session(branch="processing") as session:
                ds = xr.open_zarr(
                    session.store,
                    group="reference_01_canopy_01_hampel_filtered",
                    consolidated=False,
                )
            logger.info("Found Hampel filtered data on processing branch")
            return ds
        except Exception:
            # Deliberate best-effort: a missing branch/group is expected on
            # first run, so we only log the traceback at debug level.
            logger.debug("No Hampel data on processing branch", exc_info=True)
            return None

    def _force_or_skip(self, branch: str, force_recreate: bool) -> bool:
        """Guard pattern: return ``True`` to proceed, ``False`` to skip.

        If filtered data already exists and *force_recreate* is ``False``
        the method logs a warning and returns ``False``.  When
        *force_recreate* is ``True`` it deletes the branch first.
        """
        exists = self._try_load_hampel() is not None
        if exists and not force_recreate:
            logger.warning(
                "Filtered data already exists. Pass force_recreate=True to overwrite."
            )
            return False
        if exists and force_recreate:
            try:
                self.vod_store.delete_branch(branch)
                logger.info("Deleted existing %s branch", branch)
            except Exception:
                # Best-effort cleanup: a failed delete is survivable because
                # the subsequent write uses mode="w".
                logger.warning("Could not delete branch %s", branch, exc_info=True)
        return True

    def _persist_filtered(
        self,
        ds: xr.Dataset,
        label: str,
        target_group: str = "reference_01_canopy_01_hampel_filtered",
    ) -> str:
        """Write a filtered dataset to the ``processing`` branch.

        Rechunks variables along the ``epoch`` dimension (max 50 000 epochs
        per chunk) before writing.

        Returns the Icechunk snapshot ID.
        """
        from icechunk.xarray import to_icechunk

        # FIX: work on a shallow copy so the rechunking below does not
        # mutate the caller's dataset in place.
        ds = ds.copy()

        # Ensure processing branch exists
        try:
            current_snapshot = next(self.vod_store.repo.ancestry(branch="main")).id
            self.vod_store.repo.create_branch("processing", current_snapshot)
        except Exception:
            pass  # branch may already exist

        with self.vod_store.writable_session("processing") as session:
            logger.info("Persisting filtered data (%s)", label)
            for var_name in ds.data_vars:
                if "epoch" in ds[var_name].dims:
                    epoch_size = ds[var_name].sizes["epoch"]
                    ds[var_name] = ds[var_name].chunk(
                        {"epoch": min(epoch_size, 50000), "sid": -1}
                    )
            to_icechunk(ds, session, group=target_group, mode="w", safe_chunks=False)
            snapshot_id: str = session.commit(label)

        return snapshot_id

__init__(vod_store_path)

Initialize the workflow.

Parameters

vod_store_path : Path | str Path to the VOD Icechunk store directory.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
 96
 97
 98
 99
100
101
102
103
104
105
106
def __init__(self, vod_store_path: Path | str) -> None:
    """Set up the workflow against an Icechunk store on disk.

    Parameters
    ----------
    vod_store_path : Path | str
        Path to the VOD Icechunk store directory.

    """
    store_dir = Path(vod_store_path)
    self.vod_store_path = store_dir
    self.vod_store: MyIcechunkStore = _get_store(store_dir)

load_vod_data(group_name='reference_01_canopy_01', branch='main')

Load a VOD dataset from the store.

Parameters

group_name : str Zarr group path inside the store. branch : str Icechunk branch to read from.

Returns

xr.Dataset Lazy-loaded VOD dataset.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def load_vod_data(
    self,
    group_name: str = "reference_01_canopy_01",
    branch: str = "main",
) -> xr.Dataset:
    """Open a VOD dataset lazily from the Icechunk store.

    Parameters
    ----------
    group_name : str
        Zarr group path inside the store.
    branch : str
        Icechunk branch to read from.

    Returns
    -------
    xr.Dataset
        Lazy-loaded VOD dataset.

    """
    logger.info("Loading VOD data from branch=%s group=%s", branch, group_name)
    with self.vod_store.readonly_session(branch=branch) as session:
        dataset = xr.open_zarr(
            session.store, group=group_name, consolidated=False
        )
    logger.info("Loaded VOD dataset: %s", dict(dataset.sizes))
    return dataset

check_temporal_coverage_compatibility(main_ds, processed_ds, requested_time_range=None)

Check whether processed_ds adequately covers a time range.

When requested_time_range is None the method checks that the processed dataset covers at least 70 % of the main dataset's span. When a range is given it verifies that both endpoints fall within the processed dataset (with a 1-day tolerance).

Parameters

main_ds : xr.Dataset Reference (unfiltered) dataset. processed_ds : xr.Dataset Filtered dataset to validate. requested_time_range : tuple of date, optional (start, end) to check against.

Returns

compatible : bool coverage_info : dict Diagnostic information with main_range, processed_range, and requested_range.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def check_temporal_coverage_compatibility(
    self,
    main_ds: xr.Dataset,
    processed_ds: xr.Dataset,
    requested_time_range: tuple[datetime.date, datetime.date] | None = None,
) -> tuple[bool, dict[str, Any]]:
    """Validate the temporal coverage of *processed_ds*.

    Without *requested_time_range* the processed dataset must span at
    least 70 % of the main dataset's duration.  With a range, both the
    requested start and end must lie inside the processed span, allowing
    a one-day tolerance at either edge.

    Parameters
    ----------
    main_ds : xr.Dataset
        Reference (unfiltered) dataset.
    processed_ds : xr.Dataset
        Filtered dataset to validate.
    requested_time_range : tuple of date, optional
        ``(start, end)`` to check against.

    Returns
    -------
    compatible : bool
    coverage_info : dict
        Diagnostics with ``main_range``, ``processed_range``, and
        ``requested_range``.

    """

    def _span(ds: xr.Dataset) -> tuple[datetime.date, datetime.date]:
        """Return the (first, last) epoch of *ds* as plain dates."""
        first = pd.to_datetime(ds.epoch.min().values).date()
        last = pd.to_datetime(ds.epoch.max().values).date()
        return first, last

    main_start, main_end = _span(main_ds)
    proc_start, proc_end = _span(processed_ds)

    coverage_info: dict[str, Any] = {
        "main_range": (main_start, main_end),
        "processed_range": (proc_start, proc_end),
        "requested_range": requested_time_range,
    }

    if requested_time_range is None:
        span_main = (main_end - main_start).days
        span_proc = (proc_end - proc_start).days
        ratio = span_proc / span_main if span_main > 0 else 0.0
        logger.info(
            "Coverage check: main=%d days, processed=%d days, ratio=%.1f%%",
            span_main,
            span_proc,
            ratio * 100,
        )
        return ratio >= 0.7, coverage_info

    req_start, req_end = requested_time_range
    tolerance = datetime.timedelta(days=1)
    within_start = proc_start <= req_start <= proc_end + tolerance
    within_end = proc_start - tolerance <= req_end <= proc_end
    compatible = within_start and within_end

    if not compatible:
        logger.warning(
            "Temporal coverage mismatch: processed=%s%s, requested=%s%s",
            proc_start,
            proc_end,
            req_start,
            req_end,
        )
    return compatible, coverage_info

create_processed_data_fast_hampel_complete(start_date, end_date, force_recreate=False, window_hours=1.0, sigma_threshold=3.0, min_points=5, ultra_fast_mode=False, cell_batch_size=200, n_workers=None)

Run the vectorised / ultra-fast Hampel pipeline end-to-end.

Delegates the actual filtering to :func:canvod.grids.analysis.sigma_clip_filter.astropy_hampel_vectorized_fast (or its ultra-fast variant) and persists the result on a processing branch.

Parameters

start_date, end_date : date or datetime Temporal extent to process. force_recreate : bool Overwrite existing filtered data. window_hours : float Hampel temporal window in hours. sigma_threshold : float MAD-based outlier threshold. min_points : int Minimum observations required per window. ultra_fast_mode : bool Use the pure-NumPy sigma-clip path (faster, less precise). cell_batch_size : int Number of cells per spatial batch. n_workers : int, optional Parallel workers. None → auto-detect.

Returns

str or None Icechunk snapshot ID, or None if existing data was kept.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def create_processed_data_fast_hampel_complete(
    self,
    start_date: datetime.date | datetime.datetime,
    end_date: datetime.date | datetime.datetime,
    force_recreate: bool = False,
    window_hours: float = 1.0,
    sigma_threshold: float = 3.0,
    min_points: int = 5,
    ultra_fast_mode: bool = False,
    cell_batch_size: int = 200,
    n_workers: int | None = None,
) -> str | None:
    """Execute the fast (vectorised) Hampel filtering pipeline.

    This is a thin façade: the actual filtering is delegated to
    :func:`canvod.grids.analysis.sigma_clip_filter.astropy_hampel_vectorized_fast`
    (or its ultra-fast variant), and the result is persisted on a
    ``processing`` branch.

    Parameters
    ----------
    start_date, end_date : date or datetime
        Temporal extent to process.
    force_recreate : bool
        Overwrite existing filtered data.
    window_hours : float
        Hampel temporal window in hours.
    sigma_threshold : float
        MAD-based outlier threshold.
    min_points : int
        Minimum observations required per window.
    ultra_fast_mode : bool
        Use the pure-NumPy sigma-clip path (faster, less precise).
    cell_batch_size : int
        Number of cells per spatial batch.
    n_workers : int, optional
        Parallel workers.  ``None`` → auto-detect.

    Returns
    -------
    str or None
        Icechunk snapshot ID, or ``None`` if existing data was kept.

    """
    # Gather the forwarded arguments explicitly so the delegation reads
    # as one auditable mapping.
    pipeline_kwargs = {
        "workflow_instance": self,
        "start_date": start_date,
        "end_date": end_date,
        "force_recreate": force_recreate,
        "window_hours": window_hours,
        "sigma_threshold": sigma_threshold,
        "min_points": min_points,
        "ultra_fast_mode": ultra_fast_mode,
        "cell_batch_size": cell_batch_size,
        "n_workers": n_workers,
    }
    return _create_processed_data_fast_hampel(**pipeline_kwargs)

create_processed_data_hampel_parallel_complete(start_date, end_date, force_recreate=False, threshold=3.0, min_obs_per_sid=20, spatial_batch_size=500, n_workers=None, temporal_agg=None, agg_method=None)

Run the parallelised cell-SID Hampel pipeline end-to-end.

Loads the complete requested time range (no temporal chunking) and applies :func:canvod.grids.analysis.hampel_filtering.aggr_hampel_cell_sid_parallelized with spatial batching.

Parameters

start_date, end_date : date or datetime
    Temporal extent to process.
force_recreate : bool
    Overwrite existing filtered data.
threshold : float
    MAD-based outlier threshold.
min_obs_per_sid : int
    Minimum observations per cell-SID combination.
spatial_batch_size : int
    Cells per spatial batch.
n_workers : int, optional
    Parallel workers. None → auto-detect.
temporal_agg : str, optional
    Post-filtering aggregation frequency (e.g. '1H', '1D').
agg_method : str, optional
    Aggregation method (e.g. 'mean').

Returns

str or None Icechunk snapshot ID, or None if existing data was kept.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
def create_processed_data_hampel_parallel_complete(
    self,
    start_date: datetime.date | datetime.datetime,
    end_date: datetime.date | datetime.datetime,
    force_recreate: bool = False,
    threshold: float = 3.0,
    min_obs_per_sid: int = 20,
    spatial_batch_size: int = 500,
    n_workers: int | None = None,
    temporal_agg: str | None = None,
    agg_method: str | None = None,
) -> str | None:
    """Run the parallelised cell-SID Hampel pipeline end-to-end.

    Loads the complete requested time range (no temporal chunking) and
    applies
    :func:`canvod.grids.analysis.hampel_filtering.aggr_hampel_cell_sid_parallelized`
    with spatial batching.

    Parameters
    ----------
    start_date, end_date : date or datetime
        Temporal extent to process.
    force_recreate : bool
        Overwrite existing filtered data.
    threshold : float
        MAD-based outlier threshold.
    min_obs_per_sid : int
        Minimum observations per cell-SID combination.
    spatial_batch_size : int
        Cells per spatial batch.
    n_workers : int, optional
        Parallel workers.  ``None`` → auto-detect.
    temporal_agg : str, optional
        Post-filtering aggregation frequency (e.g. ``'1H'``, ``'1D'``).
    agg_method : str, optional
        Aggregation method (e.g. ``'mean'``).

    Returns
    -------
    str or None
        Icechunk snapshot ID, or ``None`` if existing data was kept.

    """
    from canvod.grids import create_hemigrid
    from canvod.grids.analysis.hampel_filtering import (
        aggr_hampel_cell_sid_parallelized,
    )
    from canvod.grids.operations import add_cell_ids_to_ds_fast

    logger.info("=" * 60)
    logger.info("PARALLEL HAMPEL — complete temporal coverage")
    # Fix: the range format string was "%s%s", which ran the two dates
    # together with no separator in the log output.
    logger.info(
        "Range: %s → %s | threshold=%.1f | min_obs=%d | batch=%d | workers=%s",
        start_date,
        end_date,
        threshold,
        min_obs_per_sid,
        spatial_batch_size,
        n_workers or "auto",
    )

    # --- guard: existing data ---
    if not self._force_or_skip("processing", force_recreate):
        return None

    # --- load complete time range ---
    logger.info("Loading complete time range for parallel processing")
    with self.vod_store.readonly_session(branch="main") as session:
        vod_ds = xr.open_zarr(session.store, group="reference_01_canopy_01")

    vod_ds_complete = vod_ds.sel(epoch=slice(start_date, end_date))

    # Cell IDs are required downstream; compute them on the fly when the
    # stored dataset predates the equal-area grid assignment.
    if "cell_id_equal_area_2deg" not in vod_ds_complete:
        grid = create_hemigrid(grid_type="equal_area", angular_resolution=2)
        vod_ds_complete = add_cell_ids_to_ds_fast(
            vod_ds_complete, grid, "equal_area_2deg", data_var="VOD"
        )

    logger.info("Dataset loaded: %s", dict(vod_ds_complete.sizes))

    # --- filter ---
    t0 = time.time()
    vod_ds_filtered = aggr_hampel_cell_sid_parallelized(
        vod_ds_complete,
        threshold=threshold,
        min_obs_per_sid=min_obs_per_sid,
        spatial_batch_size=spatial_batch_size,
        n_workers=n_workers,
        temporal_agg=temporal_agg,
        agg_method=agg_method,
    )
    logger.info("Parallel filtering completed in %.1f s", time.time() - t0)

    # --- persist ---
    snapshot_id = self._persist_filtered(
        vod_ds_filtered,
        "parallel Cell-SID Hampel",
    )
    logger.info("Parallel Hampel complete. Snapshot: %s", snapshot_id)
    return snapshot_id

run_complete_workflow(group_name='reference_01_canopy_01', branch='auto', time_range=None, **kwargs)

Orchestrate a complete analysis run.

Auto-detection logic (branch='auto') looks for Hampel-filtered data on the processing branch first. If found and temporally compatible it is used directly; otherwise raw data from main is returned.

Parameters

group_name : str
    Zarr group for the raw VOD data.
branch : str
    'auto' for detection, or an explicit branch name.
time_range : tuple of date, optional
    (start, end) to select.

Returns

dict Keys: final_data (Dataset), source_branch, pre_filtered (bool), filter_type.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
def run_complete_workflow(
    self,
    group_name: str = "reference_01_canopy_01",
    branch: str = "auto",
    time_range: tuple[datetime.date, datetime.date] | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Orchestrate a complete analysis run.

    Auto-detection logic (``branch='auto'``) looks for Hampel-filtered
    data on the ``processing`` branch first.  If found and temporally
    compatible it is used directly; otherwise raw data from ``main`` is
    returned.

    Parameters
    ----------
    group_name : str
        Zarr group for the raw VOD data.
    branch : str
        ``'auto'`` for detection, or an explicit branch name.
    time_range : tuple of date, optional
        ``(start, end)`` to select.

    Returns
    -------
    dict
        Keys: ``final_data`` (Dataset), ``source_branch``,
        ``pre_filtered`` (bool), ``filter_type``.

    """
    logger.info("=" * 60)
    logger.info("HAMPEL-FILTERED VOD ANALYSIS WORKFLOW")
    logger.info("branch=%s group=%s time_range=%s", branch, group_name, time_range)

    if branch == "auto":
        hampel_ds = self._try_load_hampel()

        if hampel_ds is not None:
            # Validate temporal coverage when a range is requested
            if time_range is not None:
                dataset_start = pd.to_datetime(hampel_ds.epoch.min().values).date()
                dataset_end = pd.to_datetime(hampel_ds.epoch.max().values).date()

                # Normalize date vs datetime before comparing (mixing
                # the two raises TypeError).
                start_ok = normalize_datetime_for_comparison(
                    time_range[0]
                ) >= normalize_datetime_for_comparison(dataset_start)
                end_ok = normalize_datetime_for_comparison(
                    time_range[1]
                ) <= normalize_datetime_for_comparison(dataset_end)

                if start_ok and end_ok:
                    hampel_ds = hampel_ds.sel(
                        epoch=slice(time_range[0], time_range[1])
                    )
                else:
                    # Fix: both format strings were "%s%s", which ran
                    # the paired dates together with no separator.
                    logger.warning(
                        "Hampel data (%s → %s) does not cover requested "
                        "range (%s → %s); falling back to main branch",
                        dataset_start,
                        dataset_end,
                        time_range[0],
                        time_range[1],
                    )
                    hampel_ds = None

            if hampel_ds is not None:
                logger.info("Using Hampel filtered data from processing branch")
                return {
                    "final_data": hampel_ds,
                    "source_branch": "processing",
                    "pre_filtered": True,
                    "filter_type": "hampel",
                }

        logger.info("No usable Hampel data found; using raw data from main branch")
        branch = "main"

    # --- main branch (raw) ---
    logger.info("Loading raw data from branch=%s", branch)
    vod_ds = self.load_vod_data(group_name, branch)

    if time_range is not None:
        vod_ds = vod_ds.sel(epoch=slice(time_range[0], time_range[1]))

    # Note: the previous `results = {}` initialisation at the top of the
    # function was a dead store and has been removed.
    results: dict[str, Any] = {
        "final_data": vod_ds,
        "source_branch": branch,
        "pre_filtered": False,
        "filter_type": "none",
    }
    logger.info("Workflow complete — source=%s", branch)
    return results

normalize_datetime_for_comparison(dt)

Coerce date to datetime at midnight for safe comparison.

Parameters

dt : datetime.date | datetime.datetime Date-like input.

Returns

datetime.datetime Datetime at midnight.

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def normalize_datetime_for_comparison(
    dt: datetime.date | datetime.datetime,
) -> datetime.datetime:
    """Coerce date to datetime at midnight for safe comparison.

    Parameters
    ----------
    dt : datetime.date | datetime.datetime
        Date-like input.

    Returns
    -------
    datetime.datetime
        Datetime at midnight.

    """
    if isinstance(dt, datetime.date) and not isinstance(dt, datetime.datetime):
        return datetime.datetime.combine(dt, datetime.time.min)
    return dt

get_workflow_for_store(vod_store_path)

Create an :class:AdaptedVODWorkflow for the given store path.

Parameters

vod_store_path : Path or str Path to VOD Icechunk store.

Returns

AdaptedVODWorkflow

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
703
704
705
706
707
708
709
710
711
712
713
714
715
716
def get_workflow_for_store(vod_store_path: Path | str) -> AdaptedVODWorkflow:
    """Factory: build an :class:`AdaptedVODWorkflow` bound to a store.

    Parameters
    ----------
    vod_store_path : Path or str
        Path to VOD Icechunk store.

    Returns
    -------
    AdaptedVODWorkflow

    """
    workflow = AdaptedVODWorkflow(vod_store_path)
    return workflow

check_processed_data_status(vod_store_path)

Introspect the store for Hampel-filtered data.

Parameters

vod_store_path : Path or str Path to VOD Icechunk store.

Returns

dict Keys: has_processing_branch, has_hampel_data, temporal_coverage (tuple of dates or None), data_size (dict or None).

Source code in packages/canvod-grids/src/canvod/grids/workflows/adapted_workflow.py
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
def check_processed_data_status(vod_store_path: Path | str) -> dict[str, Any]:
    """Introspect the store for Hampel-filtered data.

    Parameters
    ----------
    vod_store_path : Path or str
        Path to VOD Icechunk store.

    Returns
    -------
    dict
        Keys: ``has_processing_branch``, ``has_hampel_data``,
        ``temporal_coverage`` (tuple of dates or ``None``),
        ``data_size`` (dict or ``None``); an ``error`` key is added when
        the processing branch cannot be read.

    """
    workflow = AdaptedVODWorkflow(vod_store_path)

    # Pessimistic defaults; filled in only when the data is reachable.
    status: dict[str, Any] = {
        "has_processing_branch": False,
        "has_hampel_data": False,
        "temporal_coverage": None,
        "data_size": None,
    }

    try:
        with workflow.vod_store.readonly_session(branch="processing") as session:
            hampel_ds = xr.open_zarr(
                session.store,
                group="reference_01_canopy_01_hampel_filtered",
                consolidated=False,
            )

        first_epoch = pd.to_datetime(hampel_ds.epoch.min().values).date()
        last_epoch = pd.to_datetime(hampel_ds.epoch.max().values).date()

        status.update(
            has_processing_branch=True,
            has_hampel_data=True,
            temporal_coverage=(first_epoch, last_epoch),
            data_size=dict(hampel_ds.sizes),
        )

    except Exception as exc:  # best-effort introspection: report, never raise
        status["error"] = str(exc)

    return status